diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/LICENSE b/PyTorch/contrib/cv/semantic_segmentation/DPT/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..17e19b8d4874a7c5b1541ccaa4e217cb312d2d65
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/LICENSE
@@ -0,0 +1,203 @@
+Copyright (c) OpenMMLab. All rights reserved
+
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright 2021 Huawei Technologies Co., Ltd
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/README.md b/PyTorch/contrib/cv/semantic_segmentation/DPT/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..fda23c2ec773a23373c15d3439cd6bbb023a8c07
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/README.md
@@ -0,0 +1,200 @@
+# DPT for PyTorch
+
+- [Overview](概述.md)
+- [Preparing the Training Environment](准备训练环境.md)
+- [Training](开始训练.md)
+- [Training Results](训练结果展示.md)
+- [Release Notes](版本说明.md)
+
+
+
+# Overview
+
+## Summary
+
+DPT is a dense prediction architecture built on an encoder-decoder design, with a transformer as the basic computational building block of the encoder. It uses ViT as the encoder: the input image is split into non-overlapping tokens, and multi-head self-attention (MHSA) is used to obtain the attention among these encoded tokens. After the transformer stages the number of tokens is unchanged, attention is computed between every pair of tokens, and each token can capture its relationship to all other tokens, so the features carry a global receptive field while the spatial resolution is never reduced.
+
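+As an illustration of the token handling described above, the following is a minimal, hypothetical sketch (not the DPT implementation itself; shapes and names are assumptions for a 512x512 crop with 16x16 patches) of how ViT patch tokens can be reassembled into an image-like feature map:
+
+```
+import torch
+
+batch, embed_dim, patches_per_side = 2, 768, 32                     # 32 x 32 = 1024 patch tokens
+tokens = torch.randn(batch, 1 + patches_per_side ** 2, embed_dim)   # [cls] token + patch tokens
+
+patch_tokens = tokens[:, 1:, :]                           # drop the class token
+feature_map = patch_tokens.transpose(1, 2).reshape(       # (B, C, H/16, W/16)
+    batch, embed_dim, patches_per_side, patches_per_side)
+print(feature_map.shape)                                  # torch.Size([2, 768, 32, 32])
+```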
+
+- Reference implementation:
+
+ ```
+ url=https://github.com/open-mmlab/mmsegmentation/tree/master/configs/dpt
+ ```
+
+- Implementation adapted for Ascend AI Processors:
+
+ ```
+ url=https://gitee.com/ascend/ModelZoo-PyTorch.git
+  code_path=PyTorch/contrib/cv/semantic_segmentation
+ ```
+
+- Obtain the code via Git as follows:
+
+ ```
+  git clone {https://github.com/open-mmlab/mmsegmentation}    # clone the repository
+  cd {mmsegmentation}                                         # enter the mmsegmentation directory
+ ```
+
+- Alternatively, click [Download](https://github.com/open-mmlab/mmsegmentation/archive/refs/heads/master.zip) to download the source package.
+
+# Preparing the Training Environment
+
+## Setting Up the Environment
+
+- The firmware/driver, CANN, and PyTorch versions supported by this model are listed in the table below.
+
+  **Table 1** Version compatibility
+
+  | Component         | Version                                                      |
+  | ----------------- | ------------------------------------------------------------ |
+  | Firmware & driver | [1.0.15](https://www.hiascend.com/hardware/firmware-drivers?tag=commercial) |
+  | CANN              | [5.1.RC1](https://www.hiascend.com/software/cann/commercial?version=5.1.RC1) |
+  | PyTorch           | [1.5.0](https://gitee.com/ascend/pytorch/tree/v1.5.0/) |
+
+- Environment setup guide.
+
+  See [Preparing a PyTorch Framework Training Environment](https://www.hiascend.com/document/detail/zh/ModelZoo/pytorchframework/ptes).
+
+- Install the dependencies (add further packages as required by the model).
+
+ ```
+ pip install -r requirements.txt
+ ```
+  Build mmcv.
+ ```
+  # clone the mmcv repository
+ git clone -b v1.4.4 https://github.com/open-mmlab/mmcv.git
+
+ # configure
+ cd /mmcv
+
+ # copy
+ rm -rf ./mmcv
+ mkdir mmcv
+ cp -r mmcv_replace/* ./mmcv/
+
+ # compile
+ MMCV_WITH_OPS=1 pip install -e . -v
+
+ cd /${模型文件夹名称}
+ ```
+  Build mmsegmentation.
+  Replace the same-named files under mmsegmentation/configs with the files from configs/ in this repository, the files under mmsegmentation/mmseg with those from mmseg/, and the files under mmsegmentation/tools with those from ./tools/.
+ ```
+  # clone the repository
+ git clone https://github.com/open-mmlab/mmsegmentation
+
+ # configure
+ cd /mmsegmentation
+
+ # copy
+ cp -r /${你的存放路径}/configs/* ./configs/
+ cp -r /${你的存放路径}/mmseg/* ./mmseg/
+ cp -r /${你的存放路径}/tools/* ./tools/
+
+ # compile
+ pip install -e . -v
+ ```
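+  After both packages are built, a quick, optional sanity check that they import correctly (the exact version strings depend on your environment):
+  ```
+  import mmcv
+  import mmseg
+  print(mmcv.__version__, mmseg.__version__)  # expect mmcv 1.4.4 and the locally built mmseg
+  ```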
+
+## Preparing the Dataset
+
+1. Obtain the dataset.
+
+   Download the original dataset yourself; the open-source [ade20k](http://data.csail.mit.edu/places/ADEchallenge/ADEChallengeData2016.zip) dataset can be used. Upload it to any path on the server and extract it.
+
+   The dataset directory structure is as follows:
+ ```
+ ├──ADE20K/
+ | |──annotations/
+ | | |──training/
+ | | | ADE_train_00000001.png
+ | | | ADE_train_00000002.png
+ | | | ...
+ | | |──validation/
+ | | | ADE_val_00000001.png
+ | | | ADE_val_00000002.png
+ | | | ...
+ | |──images/
+ | | |──training/
+ | | | ADE_train_00000001.jpg
+ | | | ADE_train_00000002.jpg
+ | | | ...
+ | | |──validation/
+ | | | ADE_val_00000001.jpg
+ | | | ADE_val_00000002.jpg
+ | | | ...
+ | |──objectInfo150.txt
+ | |──sceneCategories.txt
+ ```
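+
+   At training time the dataset location comes from the `data_root` variable in configs/_base_/datasets/ade20k.py. If the data is not placed at that default path, the following is a minimal sketch of overriding it when loading the config directly (the path below is a placeholder):
+   ```
+   from mmcv import Config
+
+   cfg = Config.fromfile('configs/dpt/dpt_vit-b16_512x512_160k_ade20k.py')
+   for split in ('train', 'val', 'test'):
+       cfg.data[split].data_root = '/path/to/ADEChallengeData2016'  # placeholder path
+   ```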
+
+## Obtaining the Pretrained Model
+
+Obtain the pretrained model as described in the README.md of the original repository, or download it directly from [here](https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_base_p16_224-80ecf9dd.pth). Place it under pretrain/ in the root directory, then convert it (the output name must match the `pretrained` path in configs/_base_/models/dpt_vit-b16.py):
+```
+python mmsegmentation/tools/model_converters/vit2mmseg.py pretrain/jx_vit_base_p16_224-80ecf9dd.pth pretrain/vit-b16_p16_224-80ecf9dd.pth
+```
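+
+A quick, optional sanity check of the converted file (assuming the converter writes a plain state dict, so the keys can be listed directly):
+```
+import torch
+
+state_dict = torch.load('pretrain/vit-b16_p16_224-80ecf9dd.pth', map_location='cpu')
+print(len(state_dict), list(state_dict)[:5])  # number of weights and a few key names
+```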
+
+## Modifications
+- torch.nn.parallel._functions._get_stream uses torch.cuda; it is changed to torch.npu (see the sketch below).
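+
+  A minimal sketch of that change (an illustration only, assuming the Ascend PyTorch adapter exposes torch.npu.Stream and torch.npu.device_count; the upstream function lives in torch/nn/parallel/_functions.py):
+  ```
+  import torch
+
+  _streams = None
+
+  def _get_stream(device):
+      """Background copy stream, with torch.cuda swapped for torch.npu."""
+      global _streams
+      if device == -1:
+          return None
+      if _streams is None:
+          _streams = [None] * torch.npu.device_count()
+      if _streams[device] is None:
+          _streams[device] = torch.npu.Stream(device)
+      return _streams[device]
+  ```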
+
+# Training
+
+## Training the Model
+
+1. Enter the root directory of the model.
+
+ ```
+ cd .
+ ```
+
+2. Run the training scripts.
+
+   The model supports single-card and 8-card training on a single machine.
+
+   - Single-card training
+
+     Start single-card training.
+
+ ```
+ # training 1p accuracy
+ bash ./test/train_full_1p.sh --data_path=xxx
+ # training 1p performance
+ bash ./test/train_performance_1p.sh --data_path=xxx
+ ```
+
+   - 8-card training
+
+     Start 8-card training.
+
+ ```
+ # training 8p accuracy
+ bash ./test/train_full_8p.sh --data_path=xxx
+ # training 8p performance
+ bash ./test/train_performance_8p.sh --data_path=xxx
+ ```
+
+   After training completes, checkpoint files are saved under /work_dir by default, and the model's training accuracy and performance information is printed.
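+
+   A hedged example of loading a trained checkpoint for inference with the standard mmseg APIs (the checkpoint and image paths are placeholders, and the device string assumes the NPU-adapted PyTorch build):
+   ```
+   from mmseg.apis import init_segmentor, inference_segmentor
+
+   config = 'configs/dpt/dpt_vit-b16_512x512_160k_ade20k.py'
+   checkpoint = 'work_dir/latest.pth'                            # placeholder checkpoint path
+   model = init_segmentor(config, checkpoint, device='npu:0')    # use 'cuda:0' on GPU
+   result = inference_segmentor(model, 'demo.jpg')               # placeholder image
+   ```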
+
+# Training Results
+
+**Table 2** Training results
+
+| NAME   | decode.acc_seg | FPS   | iters | AMP_Type |
+| ------ | -------------- | ----: | ----- | -------: |
+| 1p-GPU | -              | 5.98  | 500   | O1,None  |
+| 1p-NPU | -              | 0.09  | 500   | O1,None  |
+| 8p-GPU | 80.2740        | 37.20 | 7500  | O1,None  |
+| 8p-NPU | 81.0251        | 0.57  | 7500  | O1,None  |
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/configs/_base_/datasets/ade20k.py b/PyTorch/contrib/cv/semantic_segmentation/DPT/configs/_base_/datasets/ade20k.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9e9cfa42ad297475d230525b585678a72183174
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/configs/_base_/datasets/ade20k.py
@@ -0,0 +1,69 @@
+# encoding=utf-8
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# dataset settings
+dataset_type = 'ADE20KDataset'
+data_root = 'data/ade/ADEChallengeData2016'  # change this to the actual path of ADEChallengeData2016
+img_norm_cfg = dict(
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+crop_size = (512, 512)
+train_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(type='LoadAnnotations', reduce_zero_label=True),
+ dict(type='Resize', img_scale=(2048, 512), ratio_range=(0.5, 2.0)),
+ dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
+ dict(type='RandomFlip', prob=0.5),
+ dict(type='PhotoMetricDistortion'),
+ dict(type='Normalize', **img_norm_cfg),
+ dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
+ dict(type='DefaultFormatBundle'),
+ dict(type='Collect', keys=['img', 'gt_semantic_seg']),
+]
+test_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(
+ type='MultiScaleFlipAug',
+ img_scale=(2048, 512),
+ # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75],
+ flip=False,
+ transforms=[
+ dict(type='Resize', keep_ratio=True),
+ dict(type='RandomFlip'),
+ dict(type='Normalize', **img_norm_cfg),
+ dict(type='ImageToTensor', keys=['img']),
+ dict(type='Collect', keys=['img']),
+ ])
+]
+data = dict(
+ samples_per_gpu=4,
+ workers_per_gpu=4,
+ train=dict(
+ type=dataset_type,
+ data_root=data_root,
+ img_dir='images/training',
+ ann_dir='annotations/training',
+ pipeline=train_pipeline),
+ val=dict(
+ type=dataset_type,
+ data_root=data_root,
+ img_dir='images/validation',
+ ann_dir='annotations/validation',
+ pipeline=test_pipeline),
+ test=dict(
+ type=dataset_type,
+ data_root=data_root,
+ img_dir='images/validation',
+ ann_dir='annotations/validation',
+ pipeline=test_pipeline))
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/configs/_base_/default_runtime.py b/PyTorch/contrib/cv/semantic_segmentation/DPT/configs/_base_/default_runtime.py
new file mode 100644
index 0000000000000000000000000000000000000000..48926a1de92f76ce685d0c9726cc12b46c220d17
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/configs/_base_/default_runtime.py
@@ -0,0 +1,30 @@
+# encoding=utf-8
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# yapf:disable
+log_config = dict(
+ interval=50,
+ hooks=[
+ dict(type='TextLoggerHook', by_epoch=False),
+ # dict(type='TensorboardLoggerHook')
+ # dict(type='PaviLoggerHook') # for internal services
+ ])
+# yapf:enable
+dist_params = dict(backend='nccl')
+log_level = 'INFO'
+load_from = None
+resume_from = None
+workflow = [('train', 1)]
+cudnn_benchmark = True
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/configs/_base_/models/dpt_vit-b16.py b/PyTorch/contrib/cv/semantic_segmentation/DPT/configs/_base_/models/dpt_vit-b16.py
new file mode 100644
index 0000000000000000000000000000000000000000..8d10c3ee4a637e0f112e9fae9ddf969d1afbd2a2
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/configs/_base_/models/dpt_vit-b16.py
@@ -0,0 +1,45 @@
+# encoding=utf-8
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+norm_cfg = dict(type='SyncBN', requires_grad=True)
+model = dict(
+ type='EncoderDecoder',
+ pretrained='pretrain/vit-b16_p16_224-80ecf9dd.pth', # noqa
+ backbone=dict(
+ type='VisionTransformer',
+ img_size=224,
+ embed_dims=768,
+ num_layers=12,
+ num_heads=12,
+ out_indices=(2, 5, 8, 11),
+ final_norm=False,
+ with_cls_token=True,
+ output_cls_token=True),
+ decode_head=dict(
+ type='DPTHead',
+ in_channels=(768, 768, 768, 768),
+ channels=256,
+ embed_dims=768,
+ post_process_channels=[96, 192, 384, 768],
+ num_classes=150,
+ readout_type='project',
+ input_transform='multiple_select',
+ in_index=(0, 1, 2, 3),
+ norm_cfg=norm_cfg,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+ auxiliary_head=None,
+ # model training and testing settings
+ train_cfg=dict(),
+ test_cfg=dict(mode='whole')) # yapf: disable
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/configs/_base_/schedules/schedule_160k.py b/PyTorch/contrib/cv/semantic_segmentation/DPT/configs/_base_/schedules/schedule_160k.py
new file mode 100644
index 0000000000000000000000000000000000000000..6b35ca427b8c14ee8973ed6c56f0a1e822a5b008
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/configs/_base_/schedules/schedule_160k.py
@@ -0,0 +1,24 @@
+# encoding=utf-8
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# optimizer
+optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005)
+optimizer_config = dict()
+# learning policy
+lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False)
+# runtime settings
+runner = dict(type='IterBasedRunner', max_iters=160000)
+checkpoint_config = dict(by_epoch=False, interval=16000)
+evaluation = dict(interval=16000, metric='mIoU', pre_eval=True)
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/configs/dpt/README.md b/PyTorch/contrib/cv/semantic_segmentation/DPT/configs/dpt/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..5e6257711fc6979a9d8bb50c0577784842b1a8a0
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/configs/dpt/README.md
@@ -0,0 +1,67 @@
+# DPT
+
+[Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413)
+
+## Introduction
+
+
+
+[Official Repo](https://github.com/isl-org/DPT)
+
+[Code Snippet](https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/dpt_head.py#L215)
+
+## Abstract
+
+
+
+We introduce dense vision transformers, an architecture that leverages vision transformers in place of convolutional networks as a backbone for dense prediction tasks. We assemble tokens from various stages of the vision transformer into image-like representations at various resolutions and progressively combine them into full-resolution predictions using a convolutional decoder. The transformer backbone processes representations at a constant and relatively high resolution and has a global receptive field at every stage. These properties allow the dense vision transformer to provide finer-grained and more globally coherent predictions when compared to fully-convolutional networks. Our experiments show that this architecture yields substantial improvements on dense prediction tasks, especially when a large amount of training data is available. For monocular depth estimation, we observe an improvement of up to 28% in relative performance when compared to a state-of-the-art fully-convolutional network. When applied to semantic segmentation, dense vision transformers set a new state of the art on ADE20K with 49.02% mIoU. We further show that the architecture can be fine-tuned on smaller datasets such as NYUv2, KITTI, and Pascal Context where it also sets the new state of the art. Our models are available at [this https URL](https://github.com/isl-org/DPT).
+
+
+
+
+

+
+
+## Citation
+
+```bibtex
+@article{dosovitskiy2020,
+  title={An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale},
+  author={Dosovitskiy, Alexey and Beyer, Lucas and Kolesnikov, Alexander and Weissenborn, Dirk and Zhai, Xiaohua and Unterthiner, Thomas and Dehghani, Mostafa and Minderer, Matthias and Heigold, Georg and Gelly, Sylvain and Uszkoreit, Jakob and Houlsby, Neil},
+ journal={arXiv preprint arXiv:2010.11929},
+ year={2020}
+}
+
+@article{Ranftl2021,
+ author = {Ren\'{e} Ranftl and Alexey Bochkovskiy and Vladlen Koltun},
+ title = {Vision Transformers for Dense Prediction},
+  journal = {arXiv preprint arXiv:2103.13413},
+ year = {2021},
+}
+```
+
+## Usage
+
+To use other repositories' pre-trained models, it is necessary to convert keys.
+
+We provide a script [`vit2mmseg.py`](../../tools/model_converters/vit2mmseg.py) in the tools directory to convert the key of models from [timm](https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py) to MMSegmentation style.
+
+```shell
+python tools/model_converters/vit2mmseg.py ${PRETRAIN_PATH} ${STORE_PATH}
+```
+
+E.g.
+
+```shell
+python tools/model_converters/vit2mmseg.py https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_base_p16_224-80ecf9dd.pth pretrain/jx_vit_base_p16_224-80ecf9dd.pth
+```
+
+This script converts the model from `PRETRAIN_PATH` and stores the converted model in `STORE_PATH`.
+
+## Results and models
+
+### ADE20K
+
+| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download |
+| ------ | -------- | --------- | ------: | -------- | -------------- | ----: | ------------: | ----------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| DPT | ViT-B | 512x512 | 160000 | 8.09 | 10.41 | 46.97 | 48.34 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/dpt/dpt_vit-b16_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/dpt/dpt_vit-b16_512x512_160k_ade20k/dpt_vit-b16_512x512_160k_ade20k-db31cf52.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/dpt/dpt_vit-b16_512x512_160k_ade20k/dpt_vit-b16_512x512_160k_ade20k-20210809_172025.log.json) |
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/configs/dpt/dpt.yml b/PyTorch/contrib/cv/semantic_segmentation/DPT/configs/dpt/dpt.yml
new file mode 100644
index 0000000000000000000000000000000000000000..a4f9c65b790532b875669d4f0459ada8907af8db
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/configs/dpt/dpt.yml
@@ -0,0 +1,37 @@
+Collections:
+- Name: DPT
+ Metadata:
+ Training Data:
+ - ADE20K
+ Paper:
+ URL: https://arxiv.org/abs/2103.13413
+    Title: Vision Transformers for Dense Prediction
+ README: configs/dpt/README.md
+ Code:
+ URL: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/dpt_head.py#L215
+ Version: v0.17.0
+ Converted From:
+ Code: https://github.com/isl-org/DPT
+Models:
+- Name: dpt_vit-b16_512x512_160k_ade20k
+ In Collection: DPT
+ Metadata:
+ backbone: ViT-B
+ crop size: (512,512)
+ lr schd: 160000
+ inference time (ms/im):
+ - value: 96.06
+ hardware: V100
+ backend: PyTorch
+ batch size: 1
+ mode: FP32
+ resolution: (512,512)
+ Training Memory (GB): 8.09
+ Results:
+ - Task: Semantic Segmentation
+ Dataset: ADE20K
+ Metrics:
+ mIoU: 46.97
+ mIoU(ms+flip): 48.34
+ Config: configs/dpt/dpt_vit-b16_512x512_160k_ade20k.py
+ Weights: https://download.openmmlab.com/mmsegmentation/v0.5/dpt/dpt_vit-b16_512x512_160k_ade20k/dpt_vit-b16_512x512_160k_ade20k-db31cf52.pth
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/configs/dpt/dpt_vit-b16_512x512_160k_ade20k.py b/PyTorch/contrib/cv/semantic_segmentation/DPT/configs/dpt/dpt_vit-b16_512x512_160k_ade20k.py
new file mode 100644
index 0000000000000000000000000000000000000000..75d23200ed29bbe55e32327c4b1691dca763e0d4
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/configs/dpt/dpt_vit-b16_512x512_160k_ade20k.py
@@ -0,0 +1,47 @@
+# encoding=utf-8
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+_base_ = [
+ '../_base_/models/dpt_vit-b16.py', '../_base_/datasets/ade20k.py',
+ '../_base_/default_runtime.py', '../_base_/schedules/schedule_160k.py'
+]
+
+# AdamW optimizer, no weight decay for position embedding & layer norm
+# in backbone
+optimizer = dict(
+ _delete_=True,
+ type='AdamW',
+ lr=0.00006,
+ betas=(0.9, 0.999),
+ weight_decay=0.01,
+ paramwise_cfg=dict(
+ custom_keys={
+ 'pos_embed': dict(decay_mult=0.),
+ 'cls_token': dict(decay_mult=0.),
+ 'norm': dict(decay_mult=0.)
+ }))
+
+lr_config = dict(
+ _delete_=True,
+ policy='poly',
+ warmup='linear',
+ warmup_iters=1500,
+ warmup_ratio=1e-6,
+ power=1.0,
+ min_lr=0.0,
+ by_epoch=False)
+
+# By default, models are trained on 8 GPUs with 2 images per GPU
+data = dict(samples_per_gpu=2, workers_per_gpu=2)
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/env_npu.sh b/PyTorch/contrib/cv/semantic_segmentation/DPT/env_npu.sh
new file mode 100644
index 0000000000000000000000000000000000000000..abc92e4ae9ef638ea511eff36d8e0448d896695c
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/env_npu.sh
@@ -0,0 +1,80 @@
+#!/bin/bash
+export install_path=/usr/local/Ascend
+
+if [ -d ${install_path}/toolkit ]; then
+ export LD_LIBRARY_PATH=${install_path}/fwkacllib/lib64/:/usr/include/hdf5/lib/:/usr/local/:/usr/local/lib/:/usr/lib/:${install_path}/driver/lib64/common/:${install_path}/driver/lib64/driver/:${install_path}/add-ons:${path_lib}:${LD_LIBRARY_PATH}
+ export PATH=${install_path}/fwkacllib/ccec_compiler/bin:${install_path}/fwkacllib/bin:$PATH
+ export PYTHONPATH=${install_path}/fwkacllib/python/site-packages:${install_path}/tfplugin/python/site-packages:${install_path}/toolkit/python/site-packages:$PYTHONPATH
+ export PYTHONPATH=/usr/local/python3.7.5/lib/python3.7/site-packages:$PYTHONPATH
+ export ASCEND_OPP_PATH=${install_path}/opp
+else
+ if [ -d ${install_path}/nnae/latest ];then
+        export LD_LIBRARY_PATH=${install_path}/nnae/latest/fwkacllib/lib64/:/usr/local/:/usr/local/python3.7.5/lib/:/usr/local/openblas/lib:/usr/local/lib/:/usr/lib64/:/usr/lib/:${install_path}/driver/lib64/common/:${install_path}/driver/lib64/driver/:${install_path}/add-ons/:/usr/lib/aarch64-linux-gnu:$LD_LIBRARY_PATH
+ export PATH=$PATH:${install_path}/nnae/latest/fwkacllib/ccec_compiler/bin/:${install_path}/nnae/latest/toolkit/tools/ide_daemon/bin/
+ export ASCEND_OPP_PATH=${install_path}/nnae/latest/opp/
+ export OPTION_EXEC_EXTERN_PLUGIN_PATH=${install_path}/nnae/latest/fwkacllib/lib64/plugin/opskernel/libfe.so:${install_path}/nnae/latest/fwkacllib/lib64/plugin/opskernel/libaicpu_engine.so:${install_path}/nnae/latest/fwkacllib/lib64/plugin/opskernel/libge_local_engine.so
+ export PYTHONPATH=${install_path}/nnae/latest/fwkacllib/python/site-packages/:${install_path}/nnae/latest/fwkacllib/python/site-packages/auto_tune.egg/auto_tune:${install_path}/nnae/latest/fwkacllib/python/site-packages/schedule_search.egg:$PYTHONPATH
+ export ASCEND_AICPU_PATH=${install_path}/nnae/latest
+ else
+ export LD_LIBRARY_PATH=${install_path}/ascend-toolkit/latest/fwkacllib/lib64/:/usr/local/:/usr/local/lib/:/usr/lib64/:/usr/lib/:/usr/local/python3.7.5/lib/:/usr/local/openblas/lib:${install_path}/driver/lib64/common/:${install_path}/driver/lib64/driver/:${install_path}/add-ons/:/usr/lib/aarch64-linux-gnu:$LD_LIBRARY_PATH
+ export PATH=$PATH:${install_path}/ascend-toolkit/latest/fwkacllib/ccec_compiler/bin/:${install_path}/ascend-toolkit/latest/toolkit/tools/ide_daemon/bin/
+ export ASCEND_OPP_PATH=${install_path}/ascend-toolkit/latest/opp/
+ export OPTION_EXEC_EXTERN_PLUGIN_PATH=${install_path}/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libfe.so:${install_path}/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libaicpu_engine.so:${install_path}/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libge_local_engine.so
+ export PYTHONPATH=${install_path}/ascend-toolkit/latest/fwkacllib/python/site-packages/:${install_path}/ascend-toolkit/latest/fwkacllib/python/site-packages/auto_tune.egg/auto_tune:${install_path}/ascend-toolkit/latest/fwkacllib/python/site-packages/schedule_search.egg:$PYTHONPATH
+ export ASCEND_AICPU_PATH=${install_path}/ascend-toolkit/latest
+ fi
+fi
+
+${install_path}/driver/tools/msnpureport -g error -d 0
+${install_path}/driver/tools/msnpureport -g error -d 1
+${install_path}/driver/tools/msnpureport -g error -d 2
+${install_path}/driver/tools/msnpureport -g error -d 3
+${install_path}/driver/tools/msnpureport -g error -d 4
+${install_path}/driver/tools/msnpureport -g error -d 5
+${install_path}/driver/tools/msnpureport -g error -d 6
+${install_path}/driver/tools/msnpureport -g error -d 7
+
+# Print host logs to the serial port: 0 - off / 1 - on
+export ASCEND_SLOG_PRINT_TO_STDOUT=0
+# Default log level: 0 - debug / 1 - info / 2 - warning / 3 - error
+export ASCEND_GLOBAL_LOG_LEVEL=0
+# Event logging: 0 - off / 1 - on
+export ASCEND_GLOBAL_EVENT_ENABLE=0
+# Task queue: 0 - off / 1 - on
+export TASK_QUEUE_ENABLE=0
+# PTCopy: 0 - off / 1 - on
+export PTCOPY_ENABLE=1
+# Combining of two non-contiguous tensors: 0 - off / 1 - on
+export COMBINED_ENABLE=1
+# Combining of three non-contiguous tensors: 0 - off / 1 - on
+export TRI_COMBINED_ENABLE=1
+# Ops that must be recompiled in special scenarios; no need to modify
+export DYNAMIC_OP="ADD#MUL"
+# HCCL whitelist switch: 1 - off / 0 - on
+export HCCL_WHITELIST_DISABLE=1
+# The default HCCL connect timeout of 120s is short; set it to 1800s to match the PyTorch default
+export HCCL_CONNECT_TIMEOUT=1800
+
+ulimit -SHn 512000
+
+path_lib=$(python3.7 -c """
+import sys
+import re
+result=''
+for index in range(len(sys.path)):
+ match_sit = re.search('-packages', sys.path[index])
+ if match_sit is not None:
+ match_lib = re.search('lib', sys.path[index])
+
+ if match_lib is not None:
+ end=match_lib.span()[1]
+ result += sys.path[index][0:end] + ':'
+
+ result+=sys.path[index] + '/torch/lib:'
+print(result)"""
+)
+
+echo ${path_lib}
+
+export LD_LIBRARY_PATH=/usr/local/python3.7.5/lib/:${path_lib}:$LD_LIBRARY_PATH
+
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/__init__.py b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..90b9388db0f1710c14411f1104c44ba129dbaf0e
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/__init__.py
@@ -0,0 +1,28 @@
+# encoding=utf-8
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# flake8: noqa
+from .arraymisc import *
+from .fileio import *
+from .image import *
+from .utils import *
+from .version import *
+from .video import *
+from .visualization import *
+
+# The following modules are not imported to this level, so mmcv may be used
+# without PyTorch.
+# - runner
+# - parallel
+# - op
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/arraymisc/__init__.py b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/arraymisc/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..dd56054acbdcaf8e30061c48217eaf85868b804f
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/arraymisc/__init__.py
@@ -0,0 +1,17 @@
+# encoding=utf-8
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .quantization import dequantize, quantize
+
+__all__ = ['quantize', 'dequantize']
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/arraymisc/quantization.py b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/arraymisc/quantization.py
new file mode 100644
index 0000000000000000000000000000000000000000..363b2f997446009c3aa492f3a30d8c4595127696
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/arraymisc/quantization.py
@@ -0,0 +1,68 @@
+# encoding=utf-8
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import numpy as np
+
+
+def quantize(arr, min_val, max_val, levels, dtype=np.int64):
+ """Quantize an array of (-inf, inf) to [0, levels-1].
+
+ Args:
+ arr (ndarray): Input array.
+ min_val (scalar): Minimum value to be clipped.
+ max_val (scalar): Maximum value to be clipped.
+ levels (int): Quantization levels.
+ dtype (np.type): The type of the quantized array.
+
+ Returns:
+        ndarray: Quantized array.
+ """
+ if not (isinstance(levels, int) and levels > 1):
+ raise ValueError(
+ f'levels must be a positive integer, but got {levels}')
+ if min_val >= max_val:
+ raise ValueError(
+ f'min_val ({min_val}) must be smaller than max_val ({max_val})')
+
+ arr = np.clip(arr, min_val, max_val) - min_val
+ quantized_arr = np.minimum(
+ np.floor(levels * arr / (max_val - min_val)).astype(dtype), levels - 1)
+
+ return quantized_arr
+
+
+def dequantize(arr, min_val, max_val, levels, dtype=np.float64):
+ """Dequantize an array.
+
+ Args:
+ arr (ndarray): Input array.
+ min_val (scalar): Minimum value to be clipped.
+ max_val (scalar): Maximum value to be clipped.
+ levels (int): Quantization levels.
+ dtype (np.type): The type of the dequantized array.
+
+ Returns:
+        ndarray: Dequantized array.
+ """
+ if not (isinstance(levels, int) and levels > 1):
+ raise ValueError(
+ f'levels must be a positive integer, but got {levels}')
+ if min_val >= max_val:
+ raise ValueError(
+ f'min_val ({min_val}) must be smaller than max_val ({max_val})')
+
+ dequantized_arr = (arr + 0.5).astype(dtype) * (max_val -
+ min_val) / levels + min_val
+
+ return dequantized_arr
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/cnn/__init__.py b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/cnn/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e2fc707b6886a4c6b05214f9a3888726cfc6233e
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/cnn/__init__.py
@@ -0,0 +1,54 @@
+# encoding=utf-8
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .alexnet import AlexNet
+# yapf: disable
+from .bricks import (ACTIVATION_LAYERS, CONV_LAYERS, NORM_LAYERS,
+ PADDING_LAYERS, PLUGIN_LAYERS, UPSAMPLE_LAYERS,
+ ContextBlock, Conv2d, Conv3d, ConvAWS2d, ConvModule,
+ ConvTranspose2d, ConvTranspose3d, ConvWS2d,
+ DepthwiseSeparableConvModule, GeneralizedAttention,
+ HSigmoid, HSwish, Linear, MaxPool2d, MaxPool3d,
+ NonLocal1d, NonLocal2d, NonLocal3d, Scale, Swish,
+ build_activation_layer, build_conv_layer,
+ build_norm_layer, build_padding_layer, build_plugin_layer,
+ build_upsample_layer, conv_ws_2d, is_norm)
+from .builder import MODELS, build_model_from_cfg
+# yapf: enable
+from .resnet import ResNet, make_res_layer
+from .utils import (INITIALIZERS, Caffe2XavierInit, ConstantInit, KaimingInit,
+ NormalInit, PretrainedInit, TruncNormalInit, UniformInit,
+ XavierInit, bias_init_with_prob, caffe2_xavier_init,
+ constant_init, fuse_conv_bn, get_model_complexity_info,
+ initialize, kaiming_init, normal_init, trunc_normal_init,
+ uniform_init, xavier_init)
+from .vgg import VGG, make_vgg_layer
+
+__all__ = [
+ 'AlexNet', 'VGG', 'make_vgg_layer', 'ResNet', 'make_res_layer',
+ 'constant_init', 'xavier_init', 'normal_init', 'trunc_normal_init',
+ 'uniform_init', 'kaiming_init', 'caffe2_xavier_init',
+ 'bias_init_with_prob', 'ConvModule', 'build_activation_layer',
+ 'build_conv_layer', 'build_norm_layer', 'build_padding_layer',
+ 'build_upsample_layer', 'build_plugin_layer', 'is_norm', 'NonLocal1d',
+ 'NonLocal2d', 'NonLocal3d', 'ContextBlock', 'HSigmoid', 'Swish', 'HSwish',
+ 'GeneralizedAttention', 'ACTIVATION_LAYERS', 'CONV_LAYERS', 'NORM_LAYERS',
+ 'PADDING_LAYERS', 'UPSAMPLE_LAYERS', 'PLUGIN_LAYERS', 'Scale',
+ 'get_model_complexity_info', 'conv_ws_2d', 'ConvAWS2d', 'ConvWS2d',
+ 'fuse_conv_bn', 'DepthwiseSeparableConvModule', 'Linear', 'Conv2d',
+ 'ConvTranspose2d', 'MaxPool2d', 'ConvTranspose3d', 'MaxPool3d', 'Conv3d',
+ 'initialize', 'INITIALIZERS', 'ConstantInit', 'XavierInit', 'NormalInit',
+ 'TruncNormalInit', 'UniformInit', 'KaimingInit', 'PretrainedInit',
+ 'Caffe2XavierInit', 'MODELS', 'build_model_from_cfg'
+]
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/cnn/alexnet.py b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/cnn/alexnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..827157f9eaca3f71630bb1bfa2c0979ba254eb1e
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/cnn/alexnet.py
@@ -0,0 +1,74 @@
+# encoding=utf-8
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import logging
+
+import torch.nn as nn
+
+
+class AlexNet(nn.Module):
+ """AlexNet backbone.
+
+ Args:
+ num_classes (int): number of classes for classification.
+ """
+
+ def __init__(self, num_classes=-1):
+ super(AlexNet, self).__init__()
+ self.num_classes = num_classes
+ self.features = nn.Sequential(
+ nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2),
+ nn.ReLU(inplace=True),
+ nn.MaxPool2d(kernel_size=3, stride=2),
+ nn.Conv2d(64, 192, kernel_size=5, padding=2),
+ nn.ReLU(inplace=True),
+ nn.MaxPool2d(kernel_size=3, stride=2),
+ nn.Conv2d(192, 384, kernel_size=3, padding=1),
+ nn.ReLU(inplace=True),
+ nn.Conv2d(384, 256, kernel_size=3, padding=1),
+ nn.ReLU(inplace=True),
+ nn.Conv2d(256, 256, kernel_size=3, padding=1),
+ nn.ReLU(inplace=True),
+ nn.MaxPool2d(kernel_size=3, stride=2),
+ )
+ if self.num_classes > 0:
+ self.classifier = nn.Sequential(
+ nn.Dropout(),
+ nn.Linear(256 * 6 * 6, 4096),
+ nn.ReLU(inplace=True),
+ nn.Dropout(),
+ nn.Linear(4096, 4096),
+ nn.ReLU(inplace=True),
+ nn.Linear(4096, num_classes),
+ )
+
+ def init_weights(self, pretrained=None):
+ if isinstance(pretrained, str):
+ logger = logging.getLogger()
+ from ..runner import load_checkpoint
+ load_checkpoint(self, pretrained, strict=False, logger=logger)
+ elif pretrained is None:
+ # use default initializer
+ pass
+ else:
+ raise TypeError('pretrained must be a str or None')
+
+ def forward(self, x):
+
+ x = self.features(x)
+ if self.num_classes > 0:
+ x = x.view(x.size(0), 256 * 6 * 6)
+ x = self.classifier(x)
+
+ return x
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/cnn/bricks/__init__.py b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/cnn/bricks/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..735d51caab3a3217ec14101268d05df196c610e3
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/cnn/bricks/__init__.py
@@ -0,0 +1,48 @@
+# encoding=utf-8
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .activation import build_activation_layer
+from .context_block import ContextBlock
+from .conv import build_conv_layer
+from .conv2d_adaptive_padding import Conv2dAdaptivePadding
+from .conv_module import ConvModule
+from .conv_ws import ConvAWS2d, ConvWS2d, conv_ws_2d
+from .depthwise_separable_conv_module import DepthwiseSeparableConvModule
+from .drop import Dropout, DropPath
+from .generalized_attention import GeneralizedAttention
+from .hsigmoid import HSigmoid
+from .hswish import HSwish
+from .non_local import NonLocal1d, NonLocal2d, NonLocal3d
+from .norm import build_norm_layer, is_norm
+from .padding import build_padding_layer
+from .plugin import build_plugin_layer
+from .registry import (ACTIVATION_LAYERS, CONV_LAYERS, NORM_LAYERS,
+ PADDING_LAYERS, PLUGIN_LAYERS, UPSAMPLE_LAYERS)
+from .scale import Scale
+from .swish import Swish
+from .upsample import build_upsample_layer
+from .wrappers import (Conv2d, Conv3d, ConvTranspose2d, ConvTranspose3d,
+ Linear, MaxPool2d, MaxPool3d)
+
+__all__ = [
+ 'ConvModule', 'build_activation_layer', 'build_conv_layer',
+ 'build_norm_layer', 'build_padding_layer', 'build_upsample_layer',
+ 'build_plugin_layer', 'is_norm', 'HSigmoid', 'HSwish', 'NonLocal1d',
+ 'NonLocal2d', 'NonLocal3d', 'ContextBlock', 'GeneralizedAttention',
+ 'ACTIVATION_LAYERS', 'CONV_LAYERS', 'NORM_LAYERS', 'PADDING_LAYERS',
+ 'UPSAMPLE_LAYERS', 'PLUGIN_LAYERS', 'Scale', 'ConvAWS2d', 'ConvWS2d',
+ 'conv_ws_2d', 'DepthwiseSeparableConvModule', 'Swish', 'Linear',
+ 'Conv2dAdaptivePadding', 'Conv2d', 'ConvTranspose2d', 'MaxPool2d',
+ 'ConvTranspose3d', 'MaxPool3d', 'Conv3d', 'Dropout', 'DropPath'
+]
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/cnn/bricks/activation.py b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/cnn/bricks/activation.py
new file mode 100644
index 0000000000000000000000000000000000000000..224b5f88429c9925eeb4d5dcca5455b03877d541
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/cnn/bricks/activation.py
@@ -0,0 +1,106 @@
+# encoding=utf-8
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from mmcv.utils import TORCH_VERSION, build_from_cfg, digit_version
+from .registry import ACTIVATION_LAYERS
+
+for module in [
+ nn.ReLU, nn.LeakyReLU, nn.PReLU, nn.RReLU, nn.ReLU6, nn.ELU,
+ nn.Sigmoid, nn.Tanh
+]:
+ ACTIVATION_LAYERS.register_module(module=module)
+
+
+@ACTIVATION_LAYERS.register_module(name='Clip')
+@ACTIVATION_LAYERS.register_module()
+class Clamp(nn.Module):
+ """Clamp activation layer.
+
+ This activation function is to clamp the feature map value within
+ :math:`[min, max]`. More details can be found in ``torch.clamp()``.
+
+ Args:
+ min (Number | optional): Lower-bound of the range to be clamped to.
+ Default to -1.
+ max (Number | optional): Upper-bound of the range to be clamped to.
+ Default to 1.
+ """
+
+ def __init__(self, min=-1., max=1.):
+ super(Clamp, self).__init__()
+ self.min = min
+ self.max = max
+
+ def forward(self, x):
+ """Forward function.
+
+ Args:
+ x (torch.Tensor): The input tensor.
+
+ Returns:
+ torch.Tensor: Clamped tensor.
+ """
+ return torch.clamp(x, min=self.min, max=self.max)
+
+
+class GELU(nn.Module):
+ r"""Applies the Gaussian Error Linear Units function:
+
+ .. math::
+ \text{GELU}(x) = x * \Phi(x)
+ where :math:`\Phi(x)` is the Cumulative Distribution Function for
+ Gaussian Distribution.
+
+ Shape:
+ - Input: :math:`(N, *)` where `*` means, any number of additional
+ dimensions
+ - Output: :math:`(N, *)`, same shape as the input
+
+ .. image:: scripts/activation_images/GELU.png
+
+ Examples::
+
+ >>> m = nn.GELU()
+ >>> input = torch.randn(2)
+ >>> output = m(input)
+ """
+
+ def forward(self, input):
+ return F.gelu(input)
+
+
+if (TORCH_VERSION == 'parrots'
+ or digit_version(TORCH_VERSION) < digit_version('1.4')):
+ ACTIVATION_LAYERS.register_module(module=GELU)
+else:
+ ACTIVATION_LAYERS.register_module(module=nn.GELU)
+
+
+def build_activation_layer(cfg):
+ """Build activation layer.
+
+ Args:
+ cfg (dict): The activation layer config, which should contain:
+
+ - type (str): Layer type.
+ - layer args: Args needed to instantiate an activation layer.
+
+ Returns:
+ nn.Module: Created activation layer.
+ """
+ return build_from_cfg(cfg, ACTIVATION_LAYERS)
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/cnn/bricks/context_block.py b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/cnn/bricks/context_block.py
new file mode 100644
index 0000000000000000000000000000000000000000..727112b94c8b2a20bd34e6f0af306d9e8adad9ea
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/cnn/bricks/context_block.py
@@ -0,0 +1,138 @@
+# encoding=utf-8
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+from torch import nn
+
+from ..utils import constant_init, kaiming_init
+from .registry import PLUGIN_LAYERS
+
+
+def last_zero_init(m):
+ if isinstance(m, nn.Sequential):
+ constant_init(m[-1], val=0)
+ else:
+ constant_init(m, val=0)
+
+
+@PLUGIN_LAYERS.register_module()
+class ContextBlock(nn.Module):
+ """ContextBlock module in GCNet.
+
+ See 'GCNet: Non-local Networks Meet Squeeze-Excitation Networks and Beyond'
+ (https://arxiv.org/abs/1904.11492) for details.
+
+ Args:
+ in_channels (int): Channels of the input feature map.
+        ratio (float): Ratio of channels of the transform bottleneck.
+        pooling_type (str): Pooling method for context modeling.
+            Options are 'att' and 'avg', which stand for attention pooling and
+            average pooling respectively. Default: 'att'.
+        fusion_types (Sequence[str]): Fusion method for feature fusion.
+            Options are 'channel_add' and 'channel_mul', which stand for
+            channelwise addition and multiplication respectively.
+            Default: ('channel_add',).
+ """
+
+ _abbr_ = 'context_block'
+
+ def __init__(self,
+ in_channels,
+ ratio,
+ pooling_type='att',
+ fusion_types=('channel_add', )):
+ super(ContextBlock, self).__init__()
+ assert pooling_type in ['avg', 'att']
+ assert isinstance(fusion_types, (list, tuple))
+ valid_fusion_types = ['channel_add', 'channel_mul']
+ assert all([f in valid_fusion_types for f in fusion_types])
+ assert len(fusion_types) > 0, 'at least one fusion should be used'
+ self.in_channels = in_channels
+ self.ratio = ratio
+ self.planes = int(in_channels * ratio)
+ self.pooling_type = pooling_type
+ self.fusion_types = fusion_types
+ if pooling_type == 'att':
+ self.conv_mask = nn.Conv2d(in_channels, 1, kernel_size=1)
+ self.softmax = nn.Softmax(dim=2)
+ else:
+ self.avg_pool = nn.AdaptiveAvgPool2d(1)
+ if 'channel_add' in fusion_types:
+ self.channel_add_conv = nn.Sequential(
+ nn.Conv2d(self.in_channels, self.planes, kernel_size=1),
+ nn.LayerNorm([self.planes, 1, 1]),
+ nn.ReLU(inplace=True), # yapf: disable
+ nn.Conv2d(self.planes, self.in_channels, kernel_size=1))
+ else:
+ self.channel_add_conv = None
+ if 'channel_mul' in fusion_types:
+ self.channel_mul_conv = nn.Sequential(
+ nn.Conv2d(self.in_channels, self.planes, kernel_size=1),
+ nn.LayerNorm([self.planes, 1, 1]),
+ nn.ReLU(inplace=True), # yapf: disable
+ nn.Conv2d(self.planes, self.in_channels, kernel_size=1))
+ else:
+ self.channel_mul_conv = None
+ self.reset_parameters()
+
+ def reset_parameters(self):
+ if self.pooling_type == 'att':
+ kaiming_init(self.conv_mask, mode='fan_in')
+ self.conv_mask.inited = True
+
+ if self.channel_add_conv is not None:
+ last_zero_init(self.channel_add_conv)
+ if self.channel_mul_conv is not None:
+ last_zero_init(self.channel_mul_conv)
+
+ def spatial_pool(self, x):
+ batch, channel, height, width = x.size()
+ if self.pooling_type == 'att':
+ input_x = x
+ # [N, C, H * W]
+ input_x = input_x.view(batch, channel, height * width)
+ # [N, 1, C, H * W]
+ input_x = input_x.unsqueeze(1)
+ # [N, 1, H, W]
+ context_mask = self.conv_mask(x)
+ # [N, 1, H * W]
+ context_mask = context_mask.view(batch, 1, height * width)
+ # [N, 1, H * W]
+ context_mask = self.softmax(context_mask)
+ # [N, 1, H * W, 1]
+ context_mask = context_mask.unsqueeze(-1)
+ # [N, 1, C, 1]
+ context = torch.matmul(input_x, context_mask)
+ # [N, C, 1, 1]
+ context = context.view(batch, channel, 1, 1)
+ else:
+ # [N, C, 1, 1]
+ context = self.avg_pool(x)
+
+ return context
+
+ def forward(self, x):
+ # [N, C, 1, 1]
+ context = self.spatial_pool(x)
+
+ out = x
+ if self.channel_mul_conv is not None:
+ # [N, C, 1, 1]
+ channel_mul_term = torch.sigmoid(self.channel_mul_conv(context))
+ out = out * channel_mul_term
+ if self.channel_add_conv is not None:
+ # [N, C, 1, 1]
+ channel_add_term = self.channel_add_conv(context)
+ out = out + channel_add_term
+
+ return out
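+
+
+# Minimal usage sketch (illustrative only; the channel count and ratio below
+# are arbitrary example values). The block is residual-style: the pooled
+# global context is transformed and broadcast back onto `x`, so the input
+# shape is preserved.
+if __name__ == '__main__':
+    feat = torch.randn(2, 64, 32, 32)  # [N, C, H, W] dummy feature map
+    gc_block = ContextBlock(in_channels=64, ratio=1. / 4)
+    out = gc_block(feat)
+    assert out.shape == feat.shape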
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/cnn/bricks/conv.py b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/cnn/bricks/conv.py
new file mode 100644
index 0000000000000000000000000000000000000000..8e223592f40bafd52fbf895b604270373557d993
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/cnn/bricks/conv.py
@@ -0,0 +1,57 @@
+# encoding=utf-8
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from torch import nn
+
+from .registry import CONV_LAYERS
+
+CONV_LAYERS.register_module('Conv1d', module=nn.Conv1d)
+CONV_LAYERS.register_module('Conv2d', module=nn.Conv2d)
+CONV_LAYERS.register_module('Conv3d', module=nn.Conv3d)
+CONV_LAYERS.register_module('Conv', module=nn.Conv2d)
+
+
+def build_conv_layer(cfg, *args, **kwargs):
+ """Build convolution layer.
+
+ Args:
+ cfg (None or dict): The conv layer config, which should contain:
+ - type (str): Layer type.
+            - layer args: Args needed to instantiate a conv layer.
+ args (argument list): Arguments passed to the `__init__`
+ method of the corresponding conv layer.
+ kwargs (keyword arguments): Keyword arguments passed to the `__init__`
+ method of the corresponding conv layer.
+
+ Returns:
+ nn.Module: Created conv layer.
+ """
+ if cfg is None:
+ cfg_ = dict(type='Conv2d')
+ else:
+ if not isinstance(cfg, dict):
+ raise TypeError('cfg must be a dict')
+ if 'type' not in cfg:
+ raise KeyError('the cfg dict must contain the key "type"')
+ cfg_ = cfg.copy()
+
+ layer_type = cfg_.pop('type')
+ if layer_type not in CONV_LAYERS:
+        raise KeyError(f'Unrecognized conv type {layer_type}')
+ else:
+ conv_layer = CONV_LAYERS.get(layer_type)
+
+ layer = conv_layer(*args, **kwargs, **cfg_)
+
+ return layer
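+
+
+# Minimal usage sketch (illustrative only): the cfg dict selects the conv
+# class from CONV_LAYERS; any remaining cfg keys plus *args/**kwargs are
+# forwarded to that class's constructor, and `None` falls back to nn.Conv2d.
+if __name__ == '__main__':
+    conv = build_conv_layer(
+        dict(type='Conv2d'), 16, 32, kernel_size=3, padding=1)
+    assert isinstance(conv, nn.Conv2d)
+    assert isinstance(build_conv_layer(None, 16, 32, 3), nn.Conv2d)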
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/cnn/bricks/conv2d_adaptive_padding.py b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/cnn/bricks/conv2d_adaptive_padding.py
new file mode 100644
index 0000000000000000000000000000000000000000..86a1076e8bb01f793a5882823b5dab70e0b293b9
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/cnn/bricks/conv2d_adaptive_padding.py
@@ -0,0 +1,75 @@
+# encoding=utf-8
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import math
+
+from torch import nn
+from torch.nn import functional as F
+
+from .registry import CONV_LAYERS
+
+
+@CONV_LAYERS.register_module()
+class Conv2dAdaptivePadding(nn.Conv2d):
+ """Implementation of 2D convolution in tensorflow with `padding` as "same",
+ which applies padding to input (if needed) so that input image gets fully
+ covered by filter and stride you specified. For stride 1, this will ensure
+ that output image size is same as input. For stride of 2, output dimensions
+ will be half, for example.
+
+ Args:
+ in_channels (int): Number of channels in the input image
+ out_channels (int): Number of channels produced by the convolution
+ kernel_size (int or tuple): Size of the convolving kernel
+ stride (int or tuple, optional): Stride of the convolution. Default: 1
+ padding (int or tuple, optional): Zero-padding added to both sides of
+ the input. Default: 0
+ dilation (int or tuple, optional): Spacing between kernel elements.
+ Default: 1
+ groups (int, optional): Number of blocked connections from input
+ channels to output channels. Default: 1
+ bias (bool, optional): If ``True``, adds a learnable bias to the
+ output. Default: ``True``
+ """
+
+ def __init__(self,
+ in_channels,
+ out_channels,
+ kernel_size,
+ stride=1,
+ padding=0,
+ dilation=1,
+ groups=1,
+ bias=True):
+ super().__init__(in_channels, out_channels, kernel_size, stride, 0,
+ dilation, groups, bias)
+
+ def forward(self, x):
+ img_h, img_w = x.size()[-2:]
+ kernel_h, kernel_w = self.weight.size()[-2:]
+ stride_h, stride_w = self.stride
+ output_h = math.ceil(img_h / stride_h)
+ output_w = math.ceil(img_w / stride_w)
+ pad_h = (
+ max((output_h - 1) * self.stride[0] +
+ (kernel_h - 1) * self.dilation[0] + 1 - img_h, 0))
+ pad_w = (
+ max((output_w - 1) * self.stride[1] +
+ (kernel_w - 1) * self.dilation[1] + 1 - img_w, 0))
+ if pad_h > 0 or pad_w > 0:
+ x = F.pad(x, [
+ pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2
+ ])
+ return F.conv2d(x, self.weight, self.bias, self.stride, self.padding,
+ self.dilation, self.groups)
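+
+
+# Minimal shape sketch (illustrative only, arbitrary sizes): with "same"-style
+# padding the spatial output size is ceil(input_size / stride), independent of
+# the kernel size. `torch` is imported locally because this module itself only
+# pulls in `torch.nn` and `torch.nn.functional`.
+if __name__ == '__main__':
+    import torch
+    conv = Conv2dAdaptivePadding(8, 8, kernel_size=3, stride=2)
+    out = conv(torch.randn(1, 8, 15, 20))
+    assert out.shape[-2:] == (8, 10)  # ceil(15 / 2) = 8, ceil(20 / 2) = 10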
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/cnn/bricks/conv_module.py b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/cnn/bricks/conv_module.py
new file mode 100644
index 0000000000000000000000000000000000000000..6e0d96585f1f7f40e5e600f63a29043c04f4ba1a
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/cnn/bricks/conv_module.py
@@ -0,0 +1,219 @@
+# encoding=utf-8
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import warnings
+
+import torch.nn as nn
+
+from mmcv.utils import _BatchNorm, _InstanceNorm
+from ..utils import constant_init, kaiming_init
+from .activation import build_activation_layer
+from .conv import build_conv_layer
+from .norm import build_norm_layer
+from .padding import build_padding_layer
+from .registry import PLUGIN_LAYERS
+
+
+@PLUGIN_LAYERS.register_module()
+class ConvModule(nn.Module):
+ """A conv block that bundles conv/norm/activation layers.
+
+ This block simplifies the usage of convolution layers, which are commonly
+ used with a norm layer (e.g., BatchNorm) and activation layer (e.g., ReLU).
+ It is based upon three build methods: `build_conv_layer()`,
+ `build_norm_layer()` and `build_activation_layer()`.
+
+ Besides, we add some additional features in this module.
+ 1. Automatically set `bias` of the conv layer.
+ 2. Spectral norm is supported.
+    3. More padding modes are supported. Before PyTorch 1.5, nn.Conv2d only
+       supported zero and circular padding, so we add a "reflect" padding mode.
+
+ Args:
+ in_channels (int): Number of channels in the input feature map.
+ Same as that in ``nn._ConvNd``.
+ out_channels (int): Number of channels produced by the convolution.
+ Same as that in ``nn._ConvNd``.
+ kernel_size (int | tuple[int]): Size of the convolving kernel.
+ Same as that in ``nn._ConvNd``.
+ stride (int | tuple[int]): Stride of the convolution.
+ Same as that in ``nn._ConvNd``.
+ padding (int | tuple[int]): Zero-padding added to both sides of
+ the input. Same as that in ``nn._ConvNd``.
+ dilation (int | tuple[int]): Spacing between kernel elements.
+ Same as that in ``nn._ConvNd``.
+ groups (int): Number of blocked connections from input channels to
+ output channels. Same as that in ``nn._ConvNd``.
+ bias (bool | str): If specified as `auto`, it will be decided by the
+ norm_cfg. Bias will be set as True if `norm_cfg` is None, otherwise
+ False. Default: "auto".
+ conv_cfg (dict): Config dict for convolution layer. Default: None,
+ which means using conv2d.
+ norm_cfg (dict): Config dict for normalization layer. Default: None.
+ act_cfg (dict): Config dict for activation layer.
+ Default: dict(type='ReLU').
+ inplace (bool): Whether to use inplace mode for activation.
+ Default: True.
+ with_spectral_norm (bool): Whether use spectral norm in conv module.
+ Default: False.
+        padding_mode (str): If the `padding_mode` is not supported by the
+            current `Conv2d` in PyTorch, we will use our own padding layer
+ instead. Currently, we support ['zeros', 'circular'] with official
+ implementation and ['reflect'] with our own implementation.
+ Default: 'zeros'.
+ order (tuple[str]): The order of conv/norm/activation layers. It is a
+ sequence of "conv", "norm" and "act". Common examples are
+ ("conv", "norm", "act") and ("act", "conv", "norm").
+ Default: ('conv', 'norm', 'act').
+ """
+
+ _abbr_ = 'conv_block'
+
+ def __init__(self,
+ in_channels,
+ out_channels,
+ kernel_size,
+ stride=1,
+ padding=0,
+ dilation=1,
+ groups=1,
+ bias='auto',
+ conv_cfg=None,
+ norm_cfg=None,
+ act_cfg=dict(type='ReLU'),
+ inplace=True,
+ with_spectral_norm=False,
+ padding_mode='zeros',
+ order=('conv', 'norm', 'act')):
+ super(ConvModule, self).__init__()
+ assert conv_cfg is None or isinstance(conv_cfg, dict)
+ assert norm_cfg is None or isinstance(norm_cfg, dict)
+ assert act_cfg is None or isinstance(act_cfg, dict)
+ official_padding_mode = ['zeros', 'circular']
+ self.conv_cfg = conv_cfg
+ self.norm_cfg = norm_cfg
+ self.act_cfg = act_cfg
+ self.inplace = inplace
+ self.with_spectral_norm = with_spectral_norm
+ self.with_explicit_padding = padding_mode not in official_padding_mode
+ self.order = order
+ assert isinstance(self.order, tuple) and len(self.order) == 3
+ assert set(order) == set(['conv', 'norm', 'act'])
+
+ self.with_norm = norm_cfg is not None
+ self.with_activation = act_cfg is not None
+ # if the conv layer is before a norm layer, bias is unnecessary.
+ if bias == 'auto':
+ bias = not self.with_norm
+ self.with_bias = bias
+
+ if self.with_explicit_padding:
+ pad_cfg = dict(type=padding_mode)
+ self.padding_layer = build_padding_layer(pad_cfg, padding)
+
+ # reset padding to 0 for conv module
+ conv_padding = 0 if self.with_explicit_padding else padding
+ # build convolution layer
+ self.conv = build_conv_layer(
+ conv_cfg,
+ in_channels,
+ out_channels,
+ kernel_size,
+ stride=stride,
+ padding=conv_padding,
+ dilation=dilation,
+ groups=groups,
+ bias=bias)
+ # export the attributes of self.conv to a higher level for convenience
+ self.in_channels = self.conv.in_channels
+ self.out_channels = self.conv.out_channels
+ self.kernel_size = self.conv.kernel_size
+ self.stride = self.conv.stride
+ self.padding = padding
+ self.dilation = self.conv.dilation
+ self.transposed = self.conv.transposed
+ self.output_padding = self.conv.output_padding
+ self.groups = self.conv.groups
+
+ if self.with_spectral_norm:
+ self.conv = nn.utils.spectral_norm(self.conv)
+
+ # build normalization layers
+ if self.with_norm:
+ # norm layer is after conv layer
+ if order.index('norm') > order.index('conv'):
+ norm_channels = out_channels
+ else:
+ norm_channels = in_channels
+ self.norm_name, norm = build_norm_layer(norm_cfg, norm_channels)
+ self.add_module(self.norm_name, norm)
+ if self.with_bias:
+ if isinstance(norm, (_BatchNorm, _InstanceNorm)):
+ warnings.warn(
+ 'Unnecessary conv bias before batch/instance norm')
+ else:
+ self.norm_name = None
+
+ # build activation layer
+ if self.with_activation:
+ act_cfg_ = act_cfg.copy()
+ # nn.Tanh has no 'inplace' argument
+ if act_cfg_['type'] not in [
+ 'Tanh', 'PReLU', 'Sigmoid', 'HSigmoid', 'Swish'
+ ]:
+ act_cfg_.setdefault('inplace', inplace)
+ self.activate = build_activation_layer(act_cfg_)
+
+ # Use msra init by default
+ self.init_weights()
+
+ @property
+ def norm(self):
+ if self.norm_name:
+ return getattr(self, self.norm_name)
+ else:
+ return None
+
+ def init_weights(self):
+ # 1. It is mainly for customized conv layers with their own
+ # initialization manners by calling their own ``init_weights()``,
+ # and we do not want ConvModule to override the initialization.
+ # 2. For customized conv layers without their own initialization
+ # manners (that is, they don't have their own ``init_weights()``)
+ # and PyTorch's conv layers, they will be initialized by
+ # this method with default ``kaiming_init``.
+ # Note: For PyTorch's conv layers, they will be overwritten by our
+ # initialization implementation using default ``kaiming_init``.
+ if not hasattr(self.conv, 'init_weights'):
+ if self.with_activation and self.act_cfg['type'] == 'LeakyReLU':
+ nonlinearity = 'leaky_relu'
+ a = self.act_cfg.get('negative_slope', 0.01)
+ else:
+ nonlinearity = 'relu'
+ a = 0
+ kaiming_init(self.conv, a=a, nonlinearity=nonlinearity)
+ if self.with_norm:
+ constant_init(self.norm, 1, bias=0)
+
+ def forward(self, x, activate=True, norm=True):
+ for layer in self.order:
+ if layer == 'conv':
+ if self.with_explicit_padding:
+ x = self.padding_layer(x)
+ x = self.conv(x)
+ elif layer == 'norm' and norm and self.with_norm:
+ x = self.norm(x)
+ elif layer == 'act' and activate and self.with_activation:
+ x = self.activate(x)
+ return x
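+
+
+# Minimal usage sketch (illustrative only, arbitrary channel counts): a
+# conv + BN + ReLU block assembled from config dicts. With bias='auto' the
+# conv bias is dropped automatically because a norm layer follows the conv.
+if __name__ == '__main__':
+    import torch
+    block = ConvModule(
+        3, 16, 3, padding=1, norm_cfg=dict(type='BN'), act_cfg=dict(type='ReLU'))
+    out = block(torch.randn(4, 3, 32, 32))
+    assert out.shape == (4, 16, 32, 32)
+    assert block.conv.bias is None  # bias disabled since norm_cfg is set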
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/cnn/bricks/conv_ws.py b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/cnn/bricks/conv_ws.py
new file mode 100644
index 0000000000000000000000000000000000000000..16a23a79fc9fd468902af922677fe7e0a1d36a51
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/cnn/bricks/conv_ws.py
@@ -0,0 +1,161 @@
+# encoding=utf-8
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from .registry import CONV_LAYERS
+
+
+def conv_ws_2d(input,
+ weight,
+ bias=None,
+ stride=1,
+ padding=0,
+ dilation=1,
+ groups=1,
+ eps=1e-5):
+ c_in = weight.size(0)
+ weight_flat = weight.view(c_in, -1)
+ mean = weight_flat.mean(dim=1, keepdim=True).view(c_in, 1, 1, 1)
+ std = weight_flat.std(dim=1, keepdim=True).view(c_in, 1, 1, 1)
+ weight = (weight - mean) / (std + eps)
+ return F.conv2d(input, weight, bias, stride, padding, dilation, groups)
+
+
+@CONV_LAYERS.register_module('ConvWS')
+class ConvWS2d(nn.Conv2d):
+
+ def __init__(self,
+ in_channels,
+ out_channels,
+ kernel_size,
+ stride=1,
+ padding=0,
+ dilation=1,
+ groups=1,
+ bias=True,
+ eps=1e-5):
+ super(ConvWS2d, self).__init__(
+ in_channels,
+ out_channels,
+ kernel_size,
+ stride=stride,
+ padding=padding,
+ dilation=dilation,
+ groups=groups,
+ bias=bias)
+ self.eps = eps
+
+ def forward(self, x):
+ return conv_ws_2d(x, self.weight, self.bias, self.stride, self.padding,
+ self.dilation, self.groups, self.eps)
+
+
+@CONV_LAYERS.register_module(name='ConvAWS')
+class ConvAWS2d(nn.Conv2d):
+ """AWS (Adaptive Weight Standardization)
+
+ This is a variant of Weight Standardization
+ (https://arxiv.org/pdf/1903.10520.pdf)
+ It is used in DetectoRS to avoid NaN
+ (https://arxiv.org/pdf/2006.02334.pdf)
+
+ Args:
+ in_channels (int): Number of channels in the input image
+ out_channels (int): Number of channels produced by the convolution
+ kernel_size (int or tuple): Size of the conv kernel
+ stride (int or tuple, optional): Stride of the convolution. Default: 1
+ padding (int or tuple, optional): Zero-padding added to both sides of
+ the input. Default: 0
+ dilation (int or tuple, optional): Spacing between kernel elements.
+ Default: 1
+ groups (int, optional): Number of blocked connections from input
+ channels to output channels. Default: 1
+ bias (bool, optional): If set True, adds a learnable bias to the
+ output. Default: True
+ """
+
+ def __init__(self,
+ in_channels,
+ out_channels,
+ kernel_size,
+ stride=1,
+ padding=0,
+ dilation=1,
+ groups=1,
+ bias=True):
+ super().__init__(
+ in_channels,
+ out_channels,
+ kernel_size,
+ stride=stride,
+ padding=padding,
+ dilation=dilation,
+ groups=groups,
+ bias=bias)
+ self.register_buffer('weight_gamma',
+ torch.ones(self.out_channels, 1, 1, 1))
+ self.register_buffer('weight_beta',
+ torch.zeros(self.out_channels, 1, 1, 1))
+
+ def _get_weight(self, weight):
+ weight_flat = weight.view(weight.size(0), -1)
+ mean = weight_flat.mean(dim=1).view(-1, 1, 1, 1)
+ std = torch.sqrt(weight_flat.var(dim=1) + 1e-5).view(-1, 1, 1, 1)
+ weight = (weight - mean) / std
+ weight = self.weight_gamma * weight + self.weight_beta
+ return weight
+
+ def forward(self, x):
+ weight = self._get_weight(self.weight)
+ return F.conv2d(x, weight, self.bias, self.stride, self.padding,
+ self.dilation, self.groups)
+
+ def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
+ missing_keys, unexpected_keys, error_msgs):
+ """Override default load function.
+
+ AWS overrides the function _load_from_state_dict to recover
+ weight_gamma and weight_beta if they are missing. If weight_gamma and
+ weight_beta are found in the checkpoint, this function will return
+ after super()._load_from_state_dict. Otherwise, it will compute the
+ mean and std of the pretrained weights and store them in weight_beta
+ and weight_gamma.
+ """
+
+ self.weight_gamma.data.fill_(-1)
+ local_missing_keys = []
+ super()._load_from_state_dict(state_dict, prefix, local_metadata,
+ strict, local_missing_keys,
+ unexpected_keys, error_msgs)
+ if self.weight_gamma.data.mean() > 0:
+ for k in local_missing_keys:
+ missing_keys.append(k)
+ return
+ weight = self.weight.data
+ weight_flat = weight.view(weight.size(0), -1)
+ mean = weight_flat.mean(dim=1).view(-1, 1, 1, 1)
+ std = torch.sqrt(weight_flat.var(dim=1) + 1e-5).view(-1, 1, 1, 1)
+ self.weight_beta.data.copy_(mean)
+ self.weight_gamma.data.copy_(std)
+ missing_gamma_beta = [
+ k for k in local_missing_keys
+ if k.endswith('weight_gamma') or k.endswith('weight_beta')
+ ]
+ for k in missing_gamma_beta:
+ local_missing_keys.remove(k)
+ for k in local_missing_keys:
+ missing_keys.append(k)
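+
+
+# Minimal usage sketch (illustrative only, arbitrary sizes): ConvWS2d
+# standardizes its kernel (zero mean, unit std per output channel) on the fly
+# at every forward pass, while the stored `weight` parameter stays untouched.
+if __name__ == '__main__':
+    conv = ConvWS2d(8, 16, kernel_size=3, padding=1)
+    out = conv(torch.randn(2, 8, 14, 14))
+    assert out.shape == (2, 16, 14, 14)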
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/cnn/bricks/depthwise_separable_conv_module.py b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/cnn/bricks/depthwise_separable_conv_module.py
new file mode 100644
index 0000000000000000000000000000000000000000..a211ec76cab8a414eb15c29933e891bcfad7bb40
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/cnn/bricks/depthwise_separable_conv_module.py
@@ -0,0 +1,109 @@
+# encoding=utf-8
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch.nn as nn
+
+from .conv_module import ConvModule
+
+
+class DepthwiseSeparableConvModule(nn.Module):
+ """Depthwise separable convolution module.
+
+ See https://arxiv.org/pdf/1704.04861.pdf for details.
+
+    This module can replace a ConvModule, with its single conv block replaced
+    by two conv blocks: a depthwise conv block and a pointwise conv block.
+    The depthwise conv block contains depthwise-conv/norm/activation layers,
+    and the pointwise conv block contains pointwise-conv/norm/activation
+    layers. Note that the norm/activation layers only appear in these blocks
+    when `norm_cfg` and `act_cfg` are specified.
+
+ Args:
+ in_channels (int): Number of channels in the input feature map.
+ Same as that in ``nn._ConvNd``.
+ out_channels (int): Number of channels produced by the convolution.
+ Same as that in ``nn._ConvNd``.
+ kernel_size (int | tuple[int]): Size of the convolving kernel.
+ Same as that in ``nn._ConvNd``.
+ stride (int | tuple[int]): Stride of the convolution.
+ Same as that in ``nn._ConvNd``. Default: 1.
+ padding (int | tuple[int]): Zero-padding added to both sides of
+ the input. Same as that in ``nn._ConvNd``. Default: 0.
+ dilation (int | tuple[int]): Spacing between kernel elements.
+ Same as that in ``nn._ConvNd``. Default: 1.
+ norm_cfg (dict): Default norm config for both depthwise ConvModule and
+ pointwise ConvModule. Default: None.
+ act_cfg (dict): Default activation config for both depthwise ConvModule
+ and pointwise ConvModule. Default: dict(type='ReLU').
+ dw_norm_cfg (dict): Norm config of depthwise ConvModule. If it is
+ 'default', it will be the same as `norm_cfg`. Default: 'default'.
+ dw_act_cfg (dict): Activation config of depthwise ConvModule. If it is
+ 'default', it will be the same as `act_cfg`. Default: 'default'.
+ pw_norm_cfg (dict): Norm config of pointwise ConvModule. If it is
+ 'default', it will be the same as `norm_cfg`. Default: 'default'.
+ pw_act_cfg (dict): Activation config of pointwise ConvModule. If it is
+ 'default', it will be the same as `act_cfg`. Default: 'default'.
+ kwargs (optional): Other shared arguments for depthwise and pointwise
+ ConvModule. See ConvModule for ref.
+ """
+
+ def __init__(self,
+ in_channels,
+ out_channels,
+ kernel_size,
+ stride=1,
+ padding=0,
+ dilation=1,
+ norm_cfg=None,
+ act_cfg=dict(type='ReLU'),
+ dw_norm_cfg='default',
+ dw_act_cfg='default',
+ pw_norm_cfg='default',
+ pw_act_cfg='default',
+ **kwargs):
+ super(DepthwiseSeparableConvModule, self).__init__()
+ assert 'groups' not in kwargs, 'groups should not be specified'
+
+ # if norm/activation config of depthwise/pointwise ConvModule is not
+ # specified, use default config.
+ dw_norm_cfg = dw_norm_cfg if dw_norm_cfg != 'default' else norm_cfg
+ dw_act_cfg = dw_act_cfg if dw_act_cfg != 'default' else act_cfg
+ pw_norm_cfg = pw_norm_cfg if pw_norm_cfg != 'default' else norm_cfg
+ pw_act_cfg = pw_act_cfg if pw_act_cfg != 'default' else act_cfg
+
+ # depthwise convolution
+ self.depthwise_conv = ConvModule(
+ in_channels,
+ in_channels,
+ kernel_size,
+ stride=stride,
+ padding=padding,
+ dilation=dilation,
+ groups=in_channels,
+ norm_cfg=dw_norm_cfg,
+ act_cfg=dw_act_cfg,
+ **kwargs)
+
+ self.pointwise_conv = ConvModule(
+ in_channels,
+ out_channels,
+ 1,
+ norm_cfg=pw_norm_cfg,
+ act_cfg=pw_act_cfg,
+ **kwargs)
+
+ def forward(self, x):
+ x = self.depthwise_conv(x)
+ x = self.pointwise_conv(x)
+ return x
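+
+
+# Minimal usage sketch (illustrative only, arbitrary sizes): a 3x3 depthwise
+# conv (groups == in_channels) followed by a 1x1 pointwise conv, both built
+# as ConvModule blocks that share the same norm/activation defaults.
+if __name__ == '__main__':
+    import torch
+    dsconv = DepthwiseSeparableConvModule(
+        16, 32, 3, padding=1, norm_cfg=dict(type='BN'))
+    out = dsconv(torch.randn(2, 16, 28, 28))
+    assert out.shape == (2, 32, 28, 28)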
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/cnn/bricks/drop.py b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/cnn/bricks/drop.py
new file mode 100644
index 0000000000000000000000000000000000000000..67b144a36951829a77f944368029eaaaa8a20860
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/cnn/bricks/drop.py
@@ -0,0 +1,78 @@
+# encoding=utf-8
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import torch.nn as nn
+
+from mmcv import build_from_cfg
+from .registry import DROPOUT_LAYERS
+
+
+def drop_path(x, drop_prob=0., training=False):
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of
+ residual blocks).
+
+ We follow the implementation
+ https://github.com/rwightman/pytorch-image-models/blob/a2727c1bf78ba0d7b5727f5f95e37fb7f8866b1f/timm/models/layers/drop.py # noqa: E501
+ """
+ if drop_prob == 0. or not training:
+ return x
+ keep_prob = 1 - drop_prob
+ # handle tensors with different dimensions, not just 4D tensors.
+ shape = (x.shape[0], ) + (1, ) * (x.ndim - 1)
+ random_tensor = keep_prob + torch.rand(
+ shape, dtype=x.dtype, device=x.device)
+ output = x.div(keep_prob) * random_tensor.floor()
+ return output
+
+
+@DROPOUT_LAYERS.register_module()
+class DropPath(nn.Module):
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of
+ residual blocks).
+
+ We follow the implementation
+ https://github.com/rwightman/pytorch-image-models/blob/a2727c1bf78ba0d7b5727f5f95e37fb7f8866b1f/timm/models/layers/drop.py # noqa: E501
+
+ Args:
+ drop_prob (float): Probability of the path to be zeroed. Default: 0.1
+ """
+
+ def __init__(self, drop_prob=0.1):
+ super(DropPath, self).__init__()
+ self.drop_prob = drop_prob
+
+ def forward(self, x):
+ return drop_path(x, self.drop_prob, self.training)
+
+
+@DROPOUT_LAYERS.register_module()
+class Dropout(nn.Dropout):
+ """A wrapper for ``torch.nn.Dropout``, We rename the ``p`` of
+ ``torch.nn.Dropout`` to ``drop_prob`` so as to be consistent with
+ ``DropPath``
+
+ Args:
+ drop_prob (float): Probability of the elements to be
+ zeroed. Default: 0.5.
+ inplace (bool): Do the operation inplace or not. Default: False.
+ """
+
+ def __init__(self, drop_prob=0.5, inplace=False):
+ super().__init__(p=drop_prob, inplace=inplace)
+
+
+def build_dropout(cfg, default_args=None):
+ """Builder for drop out layers."""
+ return build_from_cfg(cfg, DROPOUT_LAYERS, default_args)
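+
+
+# Minimal usage sketch (illustrative only): DropPath is an identity in eval
+# mode and randomly zeroes whole samples (rescaling the survivors) in train
+# mode; `build_dropout` instantiates it from a config dict.
+if __name__ == '__main__':
+    layer = build_dropout(dict(type='DropPath', drop_prob=0.2))
+    x = torch.randn(8, 16, 7, 7)
+    assert torch.equal(layer.eval()(x), x)  # no-op at inference time
+    _ = layer.train()(x)                    # stochastic depth when training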
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/cnn/bricks/generalized_attention.py b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/cnn/bricks/generalized_attention.py
new file mode 100644
index 0000000000000000000000000000000000000000..b55547b8eec99be7878928ffc473045f87892052
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/cnn/bricks/generalized_attention.py
@@ -0,0 +1,425 @@
+# encoding=utf-8
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import math
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from ..utils import kaiming_init
+from .registry import PLUGIN_LAYERS
+
+
+@PLUGIN_LAYERS.register_module()
+class GeneralizedAttention(nn.Module):
+ """GeneralizedAttention module.
+
+    See 'An Empirical Study of Spatial Attention Mechanisms in Deep Networks'
+    (https://arxiv.org/abs/1904.05873) for details.
+
+ Args:
+ in_channels (int): Channels of the input feature map.
+ spatial_range (int): The spatial range. -1 indicates no spatial range
+ constraint. Default: -1.
+ num_heads (int): The head number of empirical_attention module.
+ Default: 9.
+ position_embedding_dim (int): The position embedding dimension.
+ Default: -1.
+ position_magnitude (int): A multiplier acting on coord difference.
+ Default: 1.
+ kv_stride (int): The feature stride acting on key/value feature map.
+ Default: 2.
+ q_stride (int): The feature stride acting on query feature map.
+ Default: 1.
+ attention_type (str): A binary indicator string for indicating which
+ items in generalized empirical_attention module are used.
+ Default: '1111'.
+
+ - '1000' indicates 'query and key content' (appr - appr) item,
+ - '0100' indicates 'query content and relative position'
+ (appr - position) item,
+ - '0010' indicates 'key content only' (bias - appr) item,
+ - '0001' indicates 'relative position only' (bias - position) item.
+ """
+
+ _abbr_ = 'gen_attention_block'
+
+ def __init__(self,
+ in_channels,
+ spatial_range=-1,
+ num_heads=9,
+ position_embedding_dim=-1,
+ position_magnitude=1,
+ kv_stride=2,
+ q_stride=1,
+ attention_type='1111'):
+
+ super(GeneralizedAttention, self).__init__()
+
+ # hard range means local range for non-local operation
+ self.position_embedding_dim = (
+ position_embedding_dim
+ if position_embedding_dim > 0 else in_channels)
+
+ self.position_magnitude = position_magnitude
+ self.num_heads = num_heads
+ self.in_channels = in_channels
+ self.spatial_range = spatial_range
+ self.kv_stride = kv_stride
+ self.q_stride = q_stride
+ self.attention_type = [bool(int(_)) for _ in attention_type]
+ self.qk_embed_dim = in_channels // num_heads
+ out_c = self.qk_embed_dim * num_heads
+
+ if self.attention_type[0] or self.attention_type[1]:
+ self.query_conv = nn.Conv2d(
+ in_channels=in_channels,
+ out_channels=out_c,
+ kernel_size=1,
+ bias=False)
+ self.query_conv.kaiming_init = True
+
+ if self.attention_type[0] or self.attention_type[2]:
+ self.key_conv = nn.Conv2d(
+ in_channels=in_channels,
+ out_channels=out_c,
+ kernel_size=1,
+ bias=False)
+ self.key_conv.kaiming_init = True
+
+ self.v_dim = in_channels // num_heads
+ self.value_conv = nn.Conv2d(
+ in_channels=in_channels,
+ out_channels=self.v_dim * num_heads,
+ kernel_size=1,
+ bias=False)
+ self.value_conv.kaiming_init = True
+
+ if self.attention_type[1] or self.attention_type[3]:
+ self.appr_geom_fc_x = nn.Linear(
+ self.position_embedding_dim // 2, out_c, bias=False)
+ self.appr_geom_fc_x.kaiming_init = True
+
+ self.appr_geom_fc_y = nn.Linear(
+ self.position_embedding_dim // 2, out_c, bias=False)
+ self.appr_geom_fc_y.kaiming_init = True
+
+ if self.attention_type[2]:
+ stdv = 1.0 / math.sqrt(self.qk_embed_dim * 2)
+ appr_bias_value = -2 * stdv * torch.rand(out_c) + stdv
+ self.appr_bias = nn.Parameter(appr_bias_value)
+
+ if self.attention_type[3]:
+ stdv = 1.0 / math.sqrt(self.qk_embed_dim * 2)
+ geom_bias_value = -2 * stdv * torch.rand(out_c) + stdv
+ self.geom_bias = nn.Parameter(geom_bias_value)
+
+ self.proj_conv = nn.Conv2d(
+ in_channels=self.v_dim * num_heads,
+ out_channels=in_channels,
+ kernel_size=1,
+ bias=True)
+ self.proj_conv.kaiming_init = True
+ self.gamma = nn.Parameter(torch.zeros(1))
+
+ if self.spatial_range >= 0:
+ # only works when non local is after 3*3 conv
+ if in_channels == 256:
+ max_len = 84
+ elif in_channels == 512:
+ max_len = 42
+
+ max_len_kv = int((max_len - 1.0) / self.kv_stride + 1)
+ local_constraint_map = np.ones(
+ (max_len, max_len, max_len_kv, max_len_kv), dtype=int)
+ for iy in range(max_len):
+ for ix in range(max_len):
+ local_constraint_map[
+ iy, ix,
+ max((iy - self.spatial_range) //
+ self.kv_stride, 0):min((iy + self.spatial_range +
+ 1) // self.kv_stride +
+ 1, max_len),
+ max((ix - self.spatial_range) //
+ self.kv_stride, 0):min((ix + self.spatial_range +
+ 1) // self.kv_stride +
+ 1, max_len)] = 0
+
+ self.local_constraint_map = nn.Parameter(
+ torch.from_numpy(local_constraint_map).byte(),
+ requires_grad=False)
+
+ if self.q_stride > 1:
+ self.q_downsample = nn.AvgPool2d(
+ kernel_size=1, stride=self.q_stride)
+ else:
+ self.q_downsample = None
+
+ if self.kv_stride > 1:
+ self.kv_downsample = nn.AvgPool2d(
+ kernel_size=1, stride=self.kv_stride)
+ else:
+ self.kv_downsample = None
+
+ self.init_weights()
+
+ def get_position_embedding(self,
+ h,
+ w,
+ h_kv,
+ w_kv,
+ q_stride,
+ kv_stride,
+ device,
+ dtype,
+ feat_dim,
+ wave_length=1000):
+ # the default type of Tensor is float32, leading to type mismatch
+ # in fp16 mode. Cast it to support fp16 mode.
+ h_idxs = torch.linspace(0, h - 1, h).to(device=device, dtype=dtype)
+ h_idxs = h_idxs.view((h, 1)) * q_stride
+
+ w_idxs = torch.linspace(0, w - 1, w).to(device=device, dtype=dtype)
+ w_idxs = w_idxs.view((w, 1)) * q_stride
+
+ h_kv_idxs = torch.linspace(0, h_kv - 1, h_kv).to(
+ device=device, dtype=dtype)
+ h_kv_idxs = h_kv_idxs.view((h_kv, 1)) * kv_stride
+
+ w_kv_idxs = torch.linspace(0, w_kv - 1, w_kv).to(
+ device=device, dtype=dtype)
+ w_kv_idxs = w_kv_idxs.view((w_kv, 1)) * kv_stride
+
+ # (h, h_kv, 1)
+ h_diff = h_idxs.unsqueeze(1) - h_kv_idxs.unsqueeze(0)
+ h_diff *= self.position_magnitude
+
+ # (w, w_kv, 1)
+ w_diff = w_idxs.unsqueeze(1) - w_kv_idxs.unsqueeze(0)
+ w_diff *= self.position_magnitude
+
+ feat_range = torch.arange(0, feat_dim / 4).to(
+ device=device, dtype=dtype)
+
+ dim_mat = torch.Tensor([wave_length]).to(device=device, dtype=dtype)
+ dim_mat = dim_mat**((4. / feat_dim) * feat_range)
+ dim_mat = dim_mat.view((1, 1, -1))
+
+ embedding_x = torch.cat(
+ ((w_diff / dim_mat).sin(), (w_diff / dim_mat).cos()), dim=2)
+
+ embedding_y = torch.cat(
+ ((h_diff / dim_mat).sin(), (h_diff / dim_mat).cos()), dim=2)
+
+ return embedding_x, embedding_y
+
+ def forward(self, x_input):
+ num_heads = self.num_heads
+
+ # use empirical_attention
+ if self.q_downsample is not None:
+ x_q = self.q_downsample(x_input)
+ else:
+ x_q = x_input
+ n, _, h, w = x_q.shape
+
+ if self.kv_downsample is not None:
+ x_kv = self.kv_downsample(x_input)
+ else:
+ x_kv = x_input
+ _, _, h_kv, w_kv = x_kv.shape
+
+ if self.attention_type[0] or self.attention_type[1]:
+ proj_query = self.query_conv(x_q).view(
+ (n, num_heads, self.qk_embed_dim, h * w))
+ proj_query = proj_query.permute(0, 1, 3, 2)
+
+ if self.attention_type[0] or self.attention_type[2]:
+ proj_key = self.key_conv(x_kv).view(
+ (n, num_heads, self.qk_embed_dim, h_kv * w_kv))
+
+ if self.attention_type[1] or self.attention_type[3]:
+ position_embed_x, position_embed_y = self.get_position_embedding(
+ h, w, h_kv, w_kv, self.q_stride, self.kv_stride,
+ x_input.device, x_input.dtype, self.position_embedding_dim)
+ # (n, num_heads, w, w_kv, dim)
+ position_feat_x = self.appr_geom_fc_x(position_embed_x).\
+ view(1, w, w_kv, num_heads, self.qk_embed_dim).\
+ permute(0, 3, 1, 2, 4).\
+ repeat(n, 1, 1, 1, 1)
+
+ # (n, num_heads, h, h_kv, dim)
+ position_feat_y = self.appr_geom_fc_y(position_embed_y).\
+ view(1, h, h_kv, num_heads, self.qk_embed_dim).\
+ permute(0, 3, 1, 2, 4).\
+ repeat(n, 1, 1, 1, 1)
+
+ position_feat_x /= math.sqrt(2)
+ position_feat_y /= math.sqrt(2)
+
+ # accelerate for saliency only
+ if (np.sum(self.attention_type) == 1) and self.attention_type[2]:
+ appr_bias = self.appr_bias.\
+ view(1, num_heads, 1, self.qk_embed_dim).\
+ repeat(n, 1, 1, 1)
+
+ energy = torch.matmul(appr_bias, proj_key).\
+ view(n, num_heads, 1, h_kv * w_kv)
+
+ h = 1
+ w = 1
+ else:
+ # (n, num_heads, h*w, h_kv*w_kv), query before key, 540mb for
+ if not self.attention_type[0]:
+ energy = torch.zeros(
+ n,
+ num_heads,
+ h,
+ w,
+ h_kv,
+ w_kv,
+ dtype=x_input.dtype,
+ device=x_input.device)
+
+ # attention_type[0]: appr - appr
+ # attention_type[1]: appr - position
+ # attention_type[2]: bias - appr
+ # attention_type[3]: bias - position
+ if self.attention_type[0] or self.attention_type[2]:
+ if self.attention_type[0] and self.attention_type[2]:
+ appr_bias = self.appr_bias.\
+ view(1, num_heads, 1, self.qk_embed_dim)
+ energy = torch.matmul(proj_query + appr_bias, proj_key).\
+ view(n, num_heads, h, w, h_kv, w_kv)
+
+ elif self.attention_type[0]:
+ energy = torch.matmul(proj_query, proj_key).\
+ view(n, num_heads, h, w, h_kv, w_kv)
+
+ elif self.attention_type[2]:
+ appr_bias = self.appr_bias.\
+ view(1, num_heads, 1, self.qk_embed_dim).\
+ repeat(n, 1, 1, 1)
+
+ energy += torch.matmul(appr_bias, proj_key).\
+ view(n, num_heads, 1, 1, h_kv, w_kv)
+
+ if self.attention_type[1] or self.attention_type[3]:
+ if self.attention_type[1] and self.attention_type[3]:
+ geom_bias = self.geom_bias.\
+ view(1, num_heads, 1, self.qk_embed_dim)
+
+ proj_query_reshape = (proj_query + geom_bias).\
+ view(n, num_heads, h, w, self.qk_embed_dim)
+
+ energy_x = torch.matmul(
+ proj_query_reshape.permute(0, 1, 3, 2, 4),
+ position_feat_x.permute(0, 1, 2, 4, 3))
+ energy_x = energy_x.\
+ permute(0, 1, 3, 2, 4).unsqueeze(4)
+
+ energy_y = torch.matmul(
+ proj_query_reshape,
+ position_feat_y.permute(0, 1, 2, 4, 3))
+ energy_y = energy_y.unsqueeze(5)
+
+ energy += energy_x + energy_y
+
+ elif self.attention_type[1]:
+ proj_query_reshape = proj_query.\
+ view(n, num_heads, h, w, self.qk_embed_dim)
+ proj_query_reshape = proj_query_reshape.\
+ permute(0, 1, 3, 2, 4)
+ position_feat_x_reshape = position_feat_x.\
+ permute(0, 1, 2, 4, 3)
+ position_feat_y_reshape = position_feat_y.\
+ permute(0, 1, 2, 4, 3)
+
+ energy_x = torch.matmul(proj_query_reshape,
+ position_feat_x_reshape)
+ energy_x = energy_x.permute(0, 1, 3, 2, 4).unsqueeze(4)
+
+ energy_y = torch.matmul(proj_query_reshape,
+ position_feat_y_reshape)
+ energy_y = energy_y.unsqueeze(5)
+
+ energy += energy_x + energy_y
+
+ elif self.attention_type[3]:
+ geom_bias = self.geom_bias.\
+ view(1, num_heads, self.qk_embed_dim, 1).\
+ repeat(n, 1, 1, 1)
+
+ position_feat_x_reshape = position_feat_x.\
+ view(n, num_heads, w*w_kv, self.qk_embed_dim)
+
+ position_feat_y_reshape = position_feat_y.\
+ view(n, num_heads, h * h_kv, self.qk_embed_dim)
+
+ energy_x = torch.matmul(position_feat_x_reshape, geom_bias)
+ energy_x = energy_x.view(n, num_heads, 1, w, 1, w_kv)
+
+ energy_y = torch.matmul(position_feat_y_reshape, geom_bias)
+ energy_y = energy_y.view(n, num_heads, h, 1, h_kv, 1)
+
+ energy += energy_x + energy_y
+
+ energy = energy.view(n, num_heads, h * w, h_kv * w_kv)
+
+ if self.spatial_range >= 0:
+ cur_local_constraint_map = \
+ self.local_constraint_map[:h, :w, :h_kv, :w_kv].\
+ contiguous().\
+ view(1, 1, h*w, h_kv*w_kv)
+
+ energy = energy.masked_fill_(cur_local_constraint_map,
+ float('-inf'))
+
+ attention = F.softmax(energy, 3)
+
+ proj_value = self.value_conv(x_kv)
+ proj_value_reshape = proj_value.\
+ view((n, num_heads, self.v_dim, h_kv * w_kv)).\
+ permute(0, 1, 3, 2)
+
+ out = torch.matmul(attention, proj_value_reshape).\
+ permute(0, 1, 3, 2).\
+ contiguous().\
+ view(n, self.v_dim * self.num_heads, h, w)
+
+ out = self.proj_conv(out)
+
+ # output is downsampled, upsample back to input size
+ if self.q_downsample is not None:
+ out = F.interpolate(
+ out,
+ size=x_input.shape[2:],
+ mode='bilinear',
+ align_corners=False)
+
+ out = self.gamma * out + x_input
+ return out
+
+ def init_weights(self):
+ for m in self.modules():
+ if hasattr(m, 'kaiming_init') and m.kaiming_init:
+ kaiming_init(
+ m,
+ mode='fan_in',
+ nonlinearity='leaky_relu',
+ bias=0,
+ distribution='uniform',
+ a=1)
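+
+
+# Minimal usage sketch (illustrative only, arbitrary sizes): with the default
+# attention_type='1111' all four empirical-attention terms are enabled. The
+# module is residual (the gamma-scaled output is added to the input), so the
+# output shape matches the input.
+if __name__ == '__main__':
+    attn = GeneralizedAttention(in_channels=64, num_heads=8)
+    x = torch.randn(1, 64, 12, 12)
+    assert attn(x).shape == x.shape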
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/cnn/bricks/hsigmoid.py b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/cnn/bricks/hsigmoid.py
new file mode 100644
index 0000000000000000000000000000000000000000..adc169c039506d50f5d5970b3a52d606f6d91379
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/cnn/bricks/hsigmoid.py
@@ -0,0 +1,59 @@
+# encoding=utf-8
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import warnings
+
+import torch.nn as nn
+
+from .registry import ACTIVATION_LAYERS
+
+
+@ACTIVATION_LAYERS.register_module()
+class HSigmoid(nn.Module):
+ """Hard Sigmoid Module. Apply the hard sigmoid function:
+ Hsigmoid(x) = min(max((x + bias) / divisor, min_value), max_value)
+ Default: Hsigmoid(x) = min(max((x + 3) / 6, 0), 1)
+
+ Note:
+ In MMCV v1.4.4, we modified the default value of args to align with
+ PyTorch official.
+
+ Args:
+ bias (float): Bias of the input feature map. Default: 3.0.
+ divisor (float): Divisor of the input feature map. Default: 6.0.
+ min_value (float): Lower bound value. Default: 0.0.
+ max_value (float): Upper bound value. Default: 1.0.
+
+ Returns:
+ Tensor: The output tensor.
+ """
+
+ def __init__(self, bias=3.0, divisor=6.0, min_value=0.0, max_value=1.0):
+ super(HSigmoid, self).__init__()
+ warnings.warn(
+ 'In MMCV v1.4.4, we modified the default value of args to align '
+ 'with PyTorch official. Previous Implementation: '
+ 'Hsigmoid(x) = min(max((x + 1) / 2, 0), 1). '
+ 'Current Implementation: '
+ 'Hsigmoid(x) = min(max((x + 3) / 6, 0), 1).')
+ self.bias = bias
+ self.divisor = divisor
+ assert self.divisor != 0
+ self.min_value = min_value
+ self.max_value = max_value
+
+ def forward(self, x):
+ x = (x + self.bias) / self.divisor
+
+ return x.clamp_(self.min_value, self.max_value)
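+
+
+# Minimal usage sketch (illustrative only): with the default arguments this
+# computes min(max((x + 3) / 6, 0), 1). `torch` is imported locally because
+# the module itself only uses `torch.nn`.
+if __name__ == '__main__':
+    import torch
+    act = HSigmoid()
+    x = torch.tensor([-4.0, 0.0, 4.0])
+    assert torch.allclose(act(x), torch.tensor([0.0, 0.5, 1.0]))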
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/cnn/bricks/hswish.py b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/cnn/bricks/hswish.py
new file mode 100644
index 0000000000000000000000000000000000000000..399abc65dec7792dd0112a4430e5c17fb6e43b4c
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/cnn/bricks/hswish.py
@@ -0,0 +1,42 @@
+# encoding=utf-8
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch.nn as nn
+
+from .registry import ACTIVATION_LAYERS
+
+
+@ACTIVATION_LAYERS.register_module()
+class HSwish(nn.Module):
+ """Hard Swish Module.
+
+ This module applies the hard swish function:
+
+ .. math::
+ Hswish(x) = x * ReLU6(x + 3) / 6
+
+ Args:
+ inplace (bool): can optionally do the operation in-place.
+ Default: False.
+
+ Returns:
+ Tensor: The output tensor.
+ """
+
+ def __init__(self, inplace=False):
+ super(HSwish, self).__init__()
+ self.act = nn.ReLU6(inplace)
+
+ def forward(self, x):
+ return x * self.act(x + 3) / 6
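+
+
+# Minimal usage sketch (illustrative only): HSwish(x) = x * ReLU6(x + 3) / 6,
+# so inputs at or below -3 map to 0 and inputs at or above +3 pass through
+# unchanged. `torch` is imported locally for the dummy data.
+if __name__ == '__main__':
+    import torch
+    act = HSwish()
+    x = torch.tensor([-4.0, 0.0, 4.0])
+    assert torch.allclose(act(x), torch.tensor([0.0, 0.0, 4.0]))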
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/cnn/bricks/non_local.py b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/cnn/bricks/non_local.py
new file mode 100644
index 0000000000000000000000000000000000000000..2372866116dd9e3321976e73277cc68e4c7d0217
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/cnn/bricks/non_local.py
@@ -0,0 +1,319 @@
+# encoding=utf-8
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from abc import ABCMeta
+
+import torch
+import torch.nn as nn
+
+from ..utils import constant_init, normal_init
+from .conv_module import ConvModule
+from .registry import PLUGIN_LAYERS
+
+
+class _NonLocalNd(nn.Module, metaclass=ABCMeta):
+ """Basic Non-local module.
+
+ This module is proposed in
+ "Non-local Neural Networks"
+ Paper reference: https://arxiv.org/abs/1711.07971
+ Code reference: https://github.com/AlexHex7/Non-local_pytorch
+
+ Args:
+ in_channels (int): Channels of the input feature map.
+ reduction (int): Channel reduction ratio. Default: 2.
+ use_scale (bool): Whether to scale pairwise_weight by
+ `1/sqrt(inter_channels)` when the mode is `embedded_gaussian`.
+ Default: True.
+ conv_cfg (None | dict): The config dict for convolution layers.
+ If not specified, it will use `nn.Conv2d` for convolution layers.
+ Default: None.
+ norm_cfg (None | dict): The config dict for normalization layers.
+ Default: None. (This parameter is only applicable to conv_out.)
+ mode (str): Options are `gaussian`, `concatenation`,
+ `embedded_gaussian` and `dot_product`. Default: embedded_gaussian.
+ """
+
+ def __init__(self,
+ in_channels,
+ reduction=2,
+ use_scale=True,
+ conv_cfg=None,
+ norm_cfg=None,
+ mode='embedded_gaussian',
+ **kwargs):
+ super(_NonLocalNd, self).__init__()
+ self.in_channels = in_channels
+ self.reduction = reduction
+ self.use_scale = use_scale
+ self.inter_channels = max(in_channels // reduction, 1)
+ self.mode = mode
+
+ if mode not in [
+ 'gaussian', 'embedded_gaussian', 'dot_product', 'concatenation'
+ ]:
+ raise ValueError("Mode should be in 'gaussian', 'concatenation', "
+ f"'embedded_gaussian' or 'dot_product', but got "
+ f'{mode} instead.')
+
+ # g, theta, phi are defaulted as `nn.ConvNd`.
+ # Here we use ConvModule for potential usage.
+ self.g = ConvModule(
+ self.in_channels,
+ self.inter_channels,
+ kernel_size=1,
+ conv_cfg=conv_cfg,
+ act_cfg=None)
+ self.conv_out = ConvModule(
+ self.inter_channels,
+ self.in_channels,
+ kernel_size=1,
+ conv_cfg=conv_cfg,
+ norm_cfg=norm_cfg,
+ act_cfg=None)
+
+ if self.mode != 'gaussian':
+ self.theta = ConvModule(
+ self.in_channels,
+ self.inter_channels,
+ kernel_size=1,
+ conv_cfg=conv_cfg,
+ act_cfg=None)
+ self.phi = ConvModule(
+ self.in_channels,
+ self.inter_channels,
+ kernel_size=1,
+ conv_cfg=conv_cfg,
+ act_cfg=None)
+
+ if self.mode == 'concatenation':
+ self.concat_project = ConvModule(
+ self.inter_channels * 2,
+ 1,
+ kernel_size=1,
+ stride=1,
+ padding=0,
+ bias=False,
+ act_cfg=dict(type='ReLU'))
+
+ self.init_weights(**kwargs)
+
+ def init_weights(self, std=0.01, zeros_init=True):
+ if self.mode != 'gaussian':
+ for m in [self.g, self.theta, self.phi]:
+ normal_init(m.conv, std=std)
+ else:
+ normal_init(self.g.conv, std=std)
+ if zeros_init:
+ if self.conv_out.norm_cfg is None:
+ constant_init(self.conv_out.conv, 0)
+ else:
+ constant_init(self.conv_out.norm, 0)
+ else:
+ if self.conv_out.norm_cfg is None:
+ normal_init(self.conv_out.conv, std=std)
+ else:
+ normal_init(self.conv_out.norm, std=std)
+
+ def gaussian(self, theta_x, phi_x):
+ # NonLocal1d pairwise_weight: [N, H, H]
+ # NonLocal2d pairwise_weight: [N, HxW, HxW]
+ # NonLocal3d pairwise_weight: [N, TxHxW, TxHxW]
+ pairwise_weight = torch.matmul(theta_x, phi_x)
+ pairwise_weight = pairwise_weight.softmax(dim=-1)
+ return pairwise_weight
+
+ def embedded_gaussian(self, theta_x, phi_x):
+ # NonLocal1d pairwise_weight: [N, H, H]
+ # NonLocal2d pairwise_weight: [N, HxW, HxW]
+ # NonLocal3d pairwise_weight: [N, TxHxW, TxHxW]
+ pairwise_weight = torch.matmul(theta_x, phi_x)
+ if self.use_scale:
+ # theta_x.shape[-1] is `self.inter_channels`
+ pairwise_weight /= theta_x.shape[-1]**0.5
+ pairwise_weight = pairwise_weight.softmax(dim=-1)
+ return pairwise_weight
+
+ def dot_product(self, theta_x, phi_x):
+ # NonLocal1d pairwise_weight: [N, H, H]
+ # NonLocal2d pairwise_weight: [N, HxW, HxW]
+ # NonLocal3d pairwise_weight: [N, TxHxW, TxHxW]
+ pairwise_weight = torch.matmul(theta_x, phi_x)
+ pairwise_weight /= pairwise_weight.shape[-1]
+ return pairwise_weight
+
+ def concatenation(self, theta_x, phi_x):
+ # NonLocal1d pairwise_weight: [N, H, H]
+ # NonLocal2d pairwise_weight: [N, HxW, HxW]
+ # NonLocal3d pairwise_weight: [N, TxHxW, TxHxW]
+ h = theta_x.size(2)
+ w = phi_x.size(3)
+ theta_x = theta_x.repeat(1, 1, 1, w)
+ phi_x = phi_x.repeat(1, 1, h, 1)
+
+ concat_feature = torch.cat([theta_x, phi_x], dim=1)
+ pairwise_weight = self.concat_project(concat_feature)
+ n, _, h, w = pairwise_weight.size()
+ pairwise_weight = pairwise_weight.view(n, h, w)
+ pairwise_weight /= pairwise_weight.shape[-1]
+
+ return pairwise_weight
+
+ def forward(self, x):
+        # Assume `reduction = 1`, then `inter_channels = C`.
+        # In 'gaussian' mode theta/phi operate on x directly, so their
+        # channel dimension is also C regardless of `reduction`.
+
+ # NonLocal1d x: [N, C, H]
+ # NonLocal2d x: [N, C, H, W]
+ # NonLocal3d x: [N, C, T, H, W]
+ n = x.size(0)
+
+ # NonLocal1d g_x: [N, H, C]
+ # NonLocal2d g_x: [N, HxW, C]
+ # NonLocal3d g_x: [N, TxHxW, C]
+ g_x = self.g(x).view(n, self.inter_channels, -1)
+ g_x = g_x.permute(0, 2, 1)
+
+ # NonLocal1d theta_x: [N, H, C], phi_x: [N, C, H]
+ # NonLocal2d theta_x: [N, HxW, C], phi_x: [N, C, HxW]
+ # NonLocal3d theta_x: [N, TxHxW, C], phi_x: [N, C, TxHxW]
+ if self.mode == 'gaussian':
+ theta_x = x.view(n, self.in_channels, -1)
+ theta_x = theta_x.permute(0, 2, 1)
+ if self.sub_sample:
+ phi_x = self.phi(x).view(n, self.in_channels, -1)
+ else:
+ phi_x = x.view(n, self.in_channels, -1)
+ elif self.mode == 'concatenation':
+ theta_x = self.theta(x).view(n, self.inter_channels, -1, 1)
+ phi_x = self.phi(x).view(n, self.inter_channels, 1, -1)
+ else:
+ theta_x = self.theta(x).view(n, self.inter_channels, -1)
+ theta_x = theta_x.permute(0, 2, 1)
+ phi_x = self.phi(x).view(n, self.inter_channels, -1)
+
+ pairwise_func = getattr(self, self.mode)
+ # NonLocal1d pairwise_weight: [N, H, H]
+ # NonLocal2d pairwise_weight: [N, HxW, HxW]
+ # NonLocal3d pairwise_weight: [N, TxHxW, TxHxW]
+ pairwise_weight = pairwise_func(theta_x, phi_x)
+
+ # NonLocal1d y: [N, H, C]
+ # NonLocal2d y: [N, HxW, C]
+ # NonLocal3d y: [N, TxHxW, C]
+ y = torch.matmul(pairwise_weight, g_x)
+ # NonLocal1d y: [N, C, H]
+ # NonLocal2d y: [N, C, H, W]
+ # NonLocal3d y: [N, C, T, H, W]
+ y = y.permute(0, 2, 1).contiguous().reshape(n, self.inter_channels,
+ *x.size()[2:])
+
+ output = x + self.conv_out(y)
+
+ return output
+
+
+class NonLocal1d(_NonLocalNd):
+ """1D Non-local module.
+
+ Args:
+ in_channels (int): Same as `NonLocalND`.
+ sub_sample (bool): Whether to apply max pooling after pairwise
+            function (note that sub-sampling only affects spatial dimensions).
+ Default: False.
+ conv_cfg (None | dict): Same as `NonLocalND`.
+ Default: dict(type='Conv1d').
+ """
+
+ def __init__(self,
+ in_channels,
+ sub_sample=False,
+ conv_cfg=dict(type='Conv1d'),
+ **kwargs):
+ super(NonLocal1d, self).__init__(
+ in_channels, conv_cfg=conv_cfg, **kwargs)
+
+ self.sub_sample = sub_sample
+
+ if sub_sample:
+ max_pool_layer = nn.MaxPool1d(kernel_size=2)
+ self.g = nn.Sequential(self.g, max_pool_layer)
+ if self.mode != 'gaussian':
+ self.phi = nn.Sequential(self.phi, max_pool_layer)
+ else:
+ self.phi = max_pool_layer
+
+
+@PLUGIN_LAYERS.register_module()
+class NonLocal2d(_NonLocalNd):
+ """2D Non-local module.
+
+ Args:
+ in_channels (int): Same as `NonLocalND`.
+ sub_sample (bool): Whether to apply max pooling after pairwise
+            function (note that sub-sampling only affects spatial dimensions).
+ Default: False.
+ conv_cfg (None | dict): Same as `NonLocalND`.
+ Default: dict(type='Conv2d').
+ """
+
+ _abbr_ = 'nonlocal_block'
+
+ def __init__(self,
+ in_channels,
+ sub_sample=False,
+ conv_cfg=dict(type='Conv2d'),
+ **kwargs):
+ super(NonLocal2d, self).__init__(
+ in_channels, conv_cfg=conv_cfg, **kwargs)
+
+ self.sub_sample = sub_sample
+
+ if sub_sample:
+ max_pool_layer = nn.MaxPool2d(kernel_size=(2, 2))
+ self.g = nn.Sequential(self.g, max_pool_layer)
+ if self.mode != 'gaussian':
+ self.phi = nn.Sequential(self.phi, max_pool_layer)
+ else:
+ self.phi = max_pool_layer
+
+
+class NonLocal3d(_NonLocalNd):
+ """3D Non-local module.
+
+ Args:
+ in_channels (int): Same as `NonLocalND`.
+ sub_sample (bool): Whether to apply max pooling after pairwise
+            function (note that sub-sampling only affects spatial dimensions).
+ Default: False.
+ conv_cfg (None | dict): Same as `NonLocalND`.
+ Default: dict(type='Conv3d').
+ """
+
+ def __init__(self,
+ in_channels,
+ sub_sample=False,
+ conv_cfg=dict(type='Conv3d'),
+ **kwargs):
+ super(NonLocal3d, self).__init__(
+ in_channels, conv_cfg=conv_cfg, **kwargs)
+ self.sub_sample = sub_sample
+
+ if sub_sample:
+ max_pool_layer = nn.MaxPool3d(kernel_size=(1, 2, 2))
+ self.g = nn.Sequential(self.g, max_pool_layer)
+ if self.mode != 'gaussian':
+ self.phi = nn.Sequential(self.phi, max_pool_layer)
+ else:
+ self.phi = max_pool_layer
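+
+
+# Minimal usage sketch (illustrative only, arbitrary sizes): an embedded
+# Gaussian NonLocal2d block is residual, so the input shape is preserved;
+# `reduction` only changes the internal theta/phi/g channel width.
+if __name__ == '__main__':
+    block = NonLocal2d(in_channels=32, reduction=2, mode='embedded_gaussian')
+    x = torch.randn(2, 32, 20, 20)
+    assert block(x).shape == x.shape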
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/cnn/bricks/norm.py b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/cnn/bricks/norm.py
new file mode 100644
index 0000000000000000000000000000000000000000..4a11604c3f0f4317dddd68ed3abc0af15f18190d
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/cnn/bricks/norm.py
@@ -0,0 +1,157 @@
+# encoding=utf-8
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import inspect
+
+import torch.nn as nn
+
+from mmcv.utils import is_tuple_of
+from mmcv.utils.parrots_wrapper import SyncBatchNorm, _BatchNorm, _InstanceNorm
+from .registry import NORM_LAYERS
+
+NORM_LAYERS.register_module('BN', module=nn.BatchNorm2d)
+NORM_LAYERS.register_module('BN1d', module=nn.BatchNorm1d)
+NORM_LAYERS.register_module('BN2d', module=nn.BatchNorm2d)
+NORM_LAYERS.register_module('BN3d', module=nn.BatchNorm3d)
+NORM_LAYERS.register_module('SyncBN', module=SyncBatchNorm)
+NORM_LAYERS.register_module('GN', module=nn.GroupNorm)
+NORM_LAYERS.register_module('LN', module=nn.LayerNorm)
+NORM_LAYERS.register_module('IN', module=nn.InstanceNorm2d)
+NORM_LAYERS.register_module('IN1d', module=nn.InstanceNorm1d)
+NORM_LAYERS.register_module('IN2d', module=nn.InstanceNorm2d)
+NORM_LAYERS.register_module('IN3d', module=nn.InstanceNorm3d)
+
+
+def infer_abbr(class_type):
+ """Infer abbreviation from the class name.
+
+ When we build a norm layer with `build_norm_layer()`, we want to preserve
+    the norm type in variable names, e.g., self.bn1, self.gn. This method will
+ infer the abbreviation to map class types to abbreviations.
+
+ Rule 1: If the class has the property "_abbr_", return the property.
+ Rule 2: If the parent class is _BatchNorm, GroupNorm, LayerNorm or
+ InstanceNorm, the abbreviation of this layer will be "bn", "gn", "ln" and
+ "in" respectively.
+ Rule 3: If the class name contains "batch", "group", "layer" or "instance",
+ the abbreviation of this layer will be "bn", "gn", "ln" and "in"
+ respectively.
+    Rule 4: Otherwise, the abbreviation falls back to "norm_layer".
+
+ Args:
+ class_type (type): The norm layer type.
+
+ Returns:
+ str: The inferred abbreviation.
+ """
+ if not inspect.isclass(class_type):
+ raise TypeError(
+ f'class_type must be a type, but got {type(class_type)}')
+ if hasattr(class_type, '_abbr_'):
+ return class_type._abbr_
+ if issubclass(class_type, _InstanceNorm): # IN is a subclass of BN
+ return 'in'
+ elif issubclass(class_type, _BatchNorm):
+ return 'bn'
+ elif issubclass(class_type, nn.GroupNorm):
+ return 'gn'
+ elif issubclass(class_type, nn.LayerNorm):
+ return 'ln'
+ else:
+ class_name = class_type.__name__.lower()
+ if 'batch' in class_name:
+ return 'bn'
+ elif 'group' in class_name:
+ return 'gn'
+ elif 'layer' in class_name:
+ return 'ln'
+ elif 'instance' in class_name:
+ return 'in'
+ else:
+ return 'norm_layer'
+
+
+def build_norm_layer(cfg, num_features, postfix=''):
+ """Build normalization layer.
+
+ Args:
+ cfg (dict): The norm layer config, which should contain:
+
+ - type (str): Layer type.
+ - layer args: Args needed to instantiate a norm layer.
+ - requires_grad (bool, optional): Whether the layer parameters
+ require gradient updates. Default: True.
+ num_features (int): Number of input channels.
+ postfix (int | str): The postfix to be appended into norm abbreviation
+ to create named layer.
+
+ Returns:
+ tuple[str, nn.Module]: The first element is the layer name consisting
+ of abbreviation and postfix, e.g., bn1, gn. The second element is the
+ created norm layer.
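+
+ Example:
+ A minimal usage sketch (illustrative values, relying only on the
+ norm layers registered above in this module):
+
+ >>> import torch.nn as nn
+ >>> name, layer = build_norm_layer(dict(type='BN'), 64)
+ >>> assert name == 'bn' and isinstance(layer, nn.BatchNorm2d)
+ >>> name, layer = build_norm_layer(dict(type='GN', num_groups=8), 64,
+ ... postfix=2)
+ >>> assert name == 'gn2' and isinstance(layer, nn.GroupNorm)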
+ """
+ if not isinstance(cfg, dict):
+ raise TypeError('cfg must be a dict')
+ if 'type' not in cfg:
+ raise KeyError('the cfg dict must contain the key "type"')
+ cfg_ = cfg.copy()
+
+ layer_type = cfg_.pop('type')
+ if layer_type not in NORM_LAYERS:
+ raise KeyError(f'Unrecognized norm type {layer_type}')
+
+ norm_layer = NORM_LAYERS.get(layer_type)
+ abbr = infer_abbr(norm_layer)
+
+ assert isinstance(postfix, (int, str))
+ name = abbr + str(postfix)
+
+ requires_grad = cfg_.pop('requires_grad', True)
+ cfg_.setdefault('eps', 1e-5)
+ if layer_type != 'GN':
+ layer = norm_layer(num_features, **cfg_)
+ if layer_type == 'SyncBN' and hasattr(layer, '_specify_ddp_gpu_num'):
+ layer._specify_ddp_gpu_num(1)
+ else:
+ assert 'num_groups' in cfg_
+ layer = norm_layer(num_channels=num_features, **cfg_)
+
+ for param in layer.parameters():
+ param.requires_grad = requires_grad
+
+ return name, layer
+
+
+def is_norm(layer, exclude=None):
+ """Check if a layer is a normalization layer.
+
+ Args:
+ layer (nn.Module): The layer to be checked.
+ exclude (type | tuple[type]): Types to be excluded.
+
+ Returns:
+ bool: Whether the layer is a norm layer.
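+
+ Example:
+ A minimal sketch (illustrative layers only):
+
+ >>> import torch.nn as nn
+ >>> assert is_norm(nn.BatchNorm2d(8))
+ >>> assert not is_norm(nn.Conv2d(8, 8, 1))
+ >>> assert not is_norm(nn.BatchNorm2d(8), exclude=nn.BatchNorm2d)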
+ """
+ if exclude is not None:
+ if not isinstance(exclude, tuple):
+ exclude = (exclude, )
+ if not is_tuple_of(exclude, type):
+ raise TypeError(
+ f'"exclude" must be either None or type or a tuple of types, '
+ f'but got {type(exclude)}: {exclude}')
+
+ if exclude and isinstance(layer, exclude):
+ return False
+
+ all_norm_bases = (_BatchNorm, _InstanceNorm, nn.GroupNorm, nn.LayerNorm)
+ return isinstance(layer, all_norm_bases)
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/cnn/bricks/padding.py b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/cnn/bricks/padding.py
new file mode 100644
index 0000000000000000000000000000000000000000..72a844d78b25afbce3cd0b66cde152037e1585bd
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/cnn/bricks/padding.py
@@ -0,0 +1,49 @@
+# encoding=utf-8
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch.nn as nn
+
+from .registry import PADDING_LAYERS
+
+PADDING_LAYERS.register_module('zero', module=nn.ZeroPad2d)
+PADDING_LAYERS.register_module('reflect', module=nn.ReflectionPad2d)
+PADDING_LAYERS.register_module('replicate', module=nn.ReplicationPad2d)
+
+
+def build_padding_layer(cfg, *args, **kwargs):
+ """Build padding layer.
+
+ Args:
+ cfg (None or dict): The padding layer config, which should contain:
+ - type (str): Layer type.
+ - layer args: Args needed to instantiate a padding layer.
+
+ Returns:
+ nn.Module: Created padding layer.
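+
+ Example:
+ A minimal sketch, using the padding layers registered above:
+
+ >>> import torch
+ >>> pad = build_padding_layer(dict(type='reflect'), 1)
+ >>> x = torch.rand(1, 3, 8, 8)
+ >>> assert pad(x).shape == (1, 3, 10, 10)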
+ """
+ if not isinstance(cfg, dict):
+ raise TypeError('cfg must be a dict')
+ if 'type' not in cfg:
+ raise KeyError('the cfg dict must contain the key "type"')
+
+ cfg_ = cfg.copy()
+ padding_type = cfg_.pop('type')
+ if padding_type not in PADDING_LAYERS:
+ raise KeyError(f'Unrecognized padding type {padding_type}.')
+ else:
+ padding_layer = PADDING_LAYERS.get(padding_type)
+
+ layer = padding_layer(*args, **kwargs, **cfg_)
+
+ return layer
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/cnn/bricks/plugin.py b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/cnn/bricks/plugin.py
new file mode 100644
index 0000000000000000000000000000000000000000..26085fd6e83bf5d40d496f21a4eb98dc6e0129c1
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/cnn/bricks/plugin.py
@@ -0,0 +1,102 @@
+# encoding=utf-8
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import inspect
+import platform
+
+from .registry import PLUGIN_LAYERS
+
+if platform.system() == 'Windows':
+ import regex as re
+else:
+ import re
+
+
+def infer_abbr(class_type):
+ """Infer abbreviation from the class name.
+
+ This method will infer the abbreviation to map class types to
+ abbreviations.
+
+ Rule 1: If the class has the property "_abbr_", return the property.
+ Rule 2: Otherwise, the abbreviation falls back to snake case of class
+ name, e.g. the abbreviation of ``FancyBlock`` will be ``fancy_block``.
+
+ Args:
+ class_type (type): The norm layer type.
+
+ Returns:
+ str: The inferred abbreviation.
+ """
+
+ def camel2snack(word):
+ """Convert camel case word into snack case.
+
+ Modified from `inflection lib
+ `_.
+
+ Example::
+
+ >>> camel2snack("FancyBlock")
+ 'fancy_block'
+ """
+
+ word = re.sub(r'([A-Z]+)([A-Z][a-z])', r'\1_\2', word)
+ word = re.sub(r'([a-z\d])([A-Z])', r'\1_\2', word)
+ word = word.replace('-', '_')
+ return word.lower()
+
+ if not inspect.isclass(class_type):
+ raise TypeError(
+ f'class_type must be a type, but got {type(class_type)}')
+ if hasattr(class_type, '_abbr_'):
+ return class_type._abbr_
+ else:
+ return camel2snack(class_type.__name__)
+
+
+def build_plugin_layer(cfg, postfix='', **kwargs):
+ """Build plugin layer.
+
+ Args:
+ cfg (None or dict): cfg should contain:
+
+ - type (str): identify plugin layer type.
+ - layer args: args needed to instantiate a plugin layer.
+ postfix (int, str): appended to the plugin abbreviation to
+ create a named layer. Default: ''.
+
+ Returns:
+ tuple[str, nn.Module]: The first one is the concatenation of
+ abbreviation and postfix. The second is the created plugin layer.
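+
+ Example:
+ A hypothetical sketch; registering ``nn.Identity`` here is only for
+ illustration and is not part of this module:
+
+ >>> import torch.nn as nn
+ >>> PLUGIN_LAYERS.register_module('Identity', module=nn.Identity)
+ >>> name, layer = build_plugin_layer(dict(type='Identity'), postfix=1)
+ >>> assert name == 'identity1' and isinstance(layer, nn.Identity)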
+ """
+ if not isinstance(cfg, dict):
+ raise TypeError('cfg must be a dict')
+ if 'type' not in cfg:
+ raise KeyError('the cfg dict must contain the key "type"')
+ cfg_ = cfg.copy()
+
+ layer_type = cfg_.pop('type')
+ if layer_type not in PLUGIN_LAYERS:
+ raise KeyError(f'Unrecognized plugin type {layer_type}')
+
+ plugin_layer = PLUGIN_LAYERS.get(layer_type)
+ abbr = infer_abbr(plugin_layer)
+
+ assert isinstance(postfix, (int, str))
+ name = abbr + str(postfix)
+
+ layer = plugin_layer(**kwargs, **cfg_)
+
+ return name, layer
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/cnn/bricks/registry.py b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/cnn/bricks/registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..ea0d81266a5513eb37c5cd3dceda40efa1437132
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/cnn/bricks/registry.py
@@ -0,0 +1,29 @@
+# encoding=utf-8
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from mmcv.utils import Registry
+
+CONV_LAYERS = Registry('conv layer')
+NORM_LAYERS = Registry('norm layer')
+ACTIVATION_LAYERS = Registry('activation layer')
+PADDING_LAYERS = Registry('padding layer')
+UPSAMPLE_LAYERS = Registry('upsample layer')
+PLUGIN_LAYERS = Registry('plugin layer')
+
+DROPOUT_LAYERS = Registry('drop out layers')
+POSITIONAL_ENCODING = Registry('position encoding')
+ATTENTION = Registry('attention')
+FEEDFORWARD_NETWORK = Registry('feed-forward Network')
+TRANSFORMER_LAYER = Registry('transformerLayer')
+TRANSFORMER_LAYER_SEQUENCE = Registry('transformer-layers sequence')
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/cnn/bricks/scale.py b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/cnn/bricks/scale.py
new file mode 100644
index 0000000000000000000000000000000000000000..cc237a901454435479bb1ec53707b1b772790128
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/cnn/bricks/scale.py
@@ -0,0 +1,34 @@
+# encoding=utf-8
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import torch.nn as nn
+
+
+class Scale(nn.Module):
+ """A learnable scale parameter.
+
+ This layer scales the input by a learnable factor. It multiplies the
+ input of any shape by a single learnable scale parameter.
+
+ Args:
+ scale (float): Initial value of scale factor. Default: 1.0
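+
+ Example:
+ A minimal sketch:
+
+ >>> import torch
+ >>> x = torch.rand(2, 3, 4, 4)
+ >>> scale = Scale(0.5)
+ >>> assert torch.allclose(scale(x), x * 0.5)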
+ """
+
+ def __init__(self, scale=1.0):
+ super(Scale, self).__init__()
+ self.scale = nn.Parameter(torch.tensor(scale, dtype=torch.float))
+
+ def forward(self, x):
+ return x * self.scale
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/cnn/bricks/swish.py b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/cnn/bricks/swish.py
new file mode 100644
index 0000000000000000000000000000000000000000..d4f0cff2f830582a65a2a03880ce9a3cc950b4b0
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/cnn/bricks/swish.py
@@ -0,0 +1,38 @@
+# encoding=utf-8
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import torch.nn as nn
+
+from .registry import ACTIVATION_LAYERS
+
+
+@ACTIVATION_LAYERS.register_module()
+class Swish(nn.Module):
+ """Swish Module.
+
+ This module applies the swish function:
+
+ .. math::
+ Swish(x) = x * Sigmoid(x)
+
+ Returns:
+ Tensor: The output tensor.
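+
+ Example:
+ A minimal sketch:
+
+ >>> import torch
+ >>> act = Swish()
+ >>> x = torch.randn(4)
+ >>> assert torch.allclose(act(x), x * torch.sigmoid(x))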
+ """
+
+ def __init__(self):
+ super(Swish, self).__init__()
+
+ def forward(self, x):
+ return x * torch.sigmoid(x)
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/cnn/bricks/transformer.py b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/cnn/bricks/transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..f9d8b904edfbb0083c9ceff2eff00a8357328c5b
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/cnn/bricks/transformer.py
@@ -0,0 +1,956 @@
+# encoding=utf-8
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import copy
+import math
+import warnings
+from typing import Sequence
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from mmcv.cnn import (Linear, build_activation_layer, build_conv_layer,
+ build_norm_layer)
+from mmcv.runner.base_module import BaseModule, ModuleList, Sequential
+from mmcv.utils import (ConfigDict, build_from_cfg, deprecated_api_warning,
+ to_2tuple)
+from .drop import build_dropout
+from .registry import (ATTENTION, FEEDFORWARD_NETWORK, POSITIONAL_ENCODING,
+ TRANSFORMER_LAYER, TRANSFORMER_LAYER_SEQUENCE)
+
+# Avoid BC-breaking of importing MultiScaleDeformableAttention from this file
+try:
+ from mmcv.ops.multi_scale_deform_attn import MultiScaleDeformableAttention # noqa F401
+ warnings.warn(
+ ImportWarning(
+ '``MultiScaleDeformableAttention`` has been moved to '
+ '``mmcv.ops.multi_scale_deform_attn``, please change original path ' # noqa E501
+ '``from mmcv.cnn.bricks.transformer import MultiScaleDeformableAttention`` ' # noqa E501
+ 'to ``from mmcv.ops.multi_scale_deform_attn import MultiScaleDeformableAttention`` ' # noqa E501
+ ))
+
+except ImportError:
+ warnings.warn('Fail to import ``MultiScaleDeformableAttention`` from '
+ '``mmcv.ops.multi_scale_deform_attn``, '
+ 'You should install ``mmcv-full`` if you need this module. ')
+
+
+def build_positional_encoding(cfg, default_args=None):
+ """Builder for Position Encoding."""
+ return build_from_cfg(cfg, POSITIONAL_ENCODING, default_args)
+
+
+def build_attention(cfg, default_args=None):
+ """Builder for attention."""
+ return build_from_cfg(cfg, ATTENTION, default_args)
+
+
+def build_feedforward_network(cfg, default_args=None):
+ """Builder for feed-forward network (FFN)."""
+ return build_from_cfg(cfg, FEEDFORWARD_NETWORK, default_args)
+
+
+def build_transformer_layer(cfg, default_args=None):
+ """Builder for transformer layer."""
+ return build_from_cfg(cfg, TRANSFORMER_LAYER, default_args)
+
+
+def build_transformer_layer_sequence(cfg, default_args=None):
+ """Builder for transformer encoder and transformer decoder."""
+ return build_from_cfg(cfg, TRANSFORMER_LAYER_SEQUENCE, default_args)
+
+
+class AdaptivePadding(nn.Module):
+ """Applies padding adaptively to the input.
+
+ This module pads the input so that it is fully covered by the filter
+ you specify. It supports two modes, "same" and "corner". The
+ "same" mode is the same as the "SAME" padding mode in TensorFlow and
+ pads zeros around the input. The "corner" mode pads zeros to the
+ bottom right of the input.
+
+ Args:
+ kernel_size (int | tuple): Size of the kernel. Default: 1.
+ stride (int | tuple): Stride of the filter. Default: 1.
+ dilation (int | tuple): Spacing between kernel elements.
+ Default: 1.
+ padding (str): Either "same" or "corner". The "corner" mode
+ pads zeros to the bottom right, and the "same" mode pads
+ zeros around the input. Default: "corner".
+
+ Example:
+ >>> kernel_size = 16
+ >>> stride = 16
+ >>> dilation = 1
+ >>> input = torch.rand(1, 1, 15, 17)
+ >>> adap_pad = AdaptivePadding(
+ >>> kernel_size=kernel_size,
+ >>> stride=stride,
+ >>> dilation=dilation,
+ >>> padding="corner")
+ >>> out = adap_pad(input)
+ >>> assert (out.shape[2], out.shape[3]) == (16, 32)
+ >>> input = torch.rand(1, 1, 16, 17)
+ >>> out = adap_pad(input)
+ >>> assert (out.shape[2], out.shape[3]) == (16, 32)
+ """
+
+ def __init__(self, kernel_size=1, stride=1, dilation=1, padding='corner'):
+ super(AdaptivePadding, self).__init__()
+ assert padding in ('same', 'corner')
+
+ kernel_size = to_2tuple(kernel_size)
+ stride = to_2tuple(stride)
+ dilation = to_2tuple(dilation)
+
+ self.padding = padding
+ self.kernel_size = kernel_size
+ self.stride = stride
+ self.dilation = dilation
+
+ def get_pad_shape(self, input_shape):
+ """Calculate the padding size of input.
+
+ Args:
+ input_shape (:obj:`torch.Size`): arrange as (H, W).
+
+ Returns:
+ Tuple[int]: The padding size along the
+ original H and W directions
+ """
+ input_h, input_w = input_shape
+ kernel_h, kernel_w = self.kernel_size
+ stride_h, stride_w = self.stride
+ output_h = math.ceil(input_h / stride_h)
+ output_w = math.ceil(input_w / stride_w)
+ pad_h = max((output_h - 1) * stride_h +
+ (kernel_h - 1) * self.dilation[0] + 1 - input_h, 0)
+ pad_w = max((output_w - 1) * stride_w +
+ (kernel_w - 1) * self.dilation[1] + 1 - input_w, 0)
+ return pad_h, pad_w
+
+ def forward(self, x):
+ """Add padding to `x`
+
+ Args:
+ x (Tensor): Input tensor has shape (B, C, H, W).
+
+ Returns:
+ Tensor: The tensor with adaptive padding
+ """
+ pad_h, pad_w = self.get_pad_shape(x.size()[-2:])
+ if pad_h > 0 or pad_w > 0:
+ if self.padding == 'corner':
+ x = F.pad(x, [0, pad_w, 0, pad_h])
+ elif self.padding == 'same':
+ x = F.pad(x, [
+ pad_w // 2, pad_w - pad_w // 2, pad_h // 2,
+ pad_h - pad_h // 2
+ ])
+ return x
+
+
+class PatchEmbed(BaseModule):
+ """Image to Patch Embedding.
+
+ We use a conv layer to implement PatchEmbed.
+
+ Args:
+ in_channels (int): The num of input channels. Default: 3
+ embed_dims (int): The dimensions of embedding. Default: 768
+ conv_type (str): The type of convolution
+ to generate patch embedding. Default: "Conv2d".
+ kernel_size (int): The kernel_size of embedding conv. Default: 16.
+ stride (int): The slide stride of embedding conv.
+ Default: 16.
+ padding (int | tuple | string): The padding length of
+ embedding conv. When it is a string, it means the mode
+ of adaptive padding, support "same" and "corner" now.
+ Default: "corner".
+ dilation (int): The dilation rate of embedding conv. Default: 1.
+ bias (bool): Bias of embed conv. Default: True.
+ norm_cfg (dict, optional): Config dict for normalization layer.
+ Default: None.
+ input_size (int | tuple | None): The size of input, which will be
+ used to calculate the out size (`init_out_size`) at
+ initialization. Default: None.
+ init_cfg (`mmcv.ConfigDict`, optional): The Config for initialization.
+ Default: None.
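+
+ Example:
+ An illustrative sketch: a 224x224 image split into 16x16 patches
+ (sizes chosen only for demonstration):
+
+ >>> import torch
+ >>> patch_embed = PatchEmbed(in_channels=3, embed_dims=768,
+ ... kernel_size=16, stride=16)
+ >>> x = torch.rand(1, 3, 224, 224)
+ >>> out, out_size = patch_embed(x)
+ >>> assert out.shape == (1, 14 * 14, 768) and out_size == (14, 14)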
+ """
+
+ def __init__(self,
+ in_channels=3,
+ embed_dims=768,
+ conv_type='Conv2d',
+ kernel_size=16,
+ stride=16,
+ padding='corner',
+ dilation=1,
+ bias=True,
+ norm_cfg=None,
+ input_size=None,
+ init_cfg=None):
+ super(PatchEmbed, self).__init__(init_cfg=init_cfg)
+
+ self.embed_dims = embed_dims
+ if stride is None:
+ stride = kernel_size
+
+ kernel_size = to_2tuple(kernel_size)
+ stride = to_2tuple(stride)
+ dilation = to_2tuple(dilation)
+
+ if isinstance(padding, str):
+ self.adaptive_padding = AdaptivePadding(
+ kernel_size=kernel_size,
+ stride=stride,
+ dilation=dilation,
+ padding=padding)
+ # disable the padding of conv
+ padding = 0
+ else:
+ self.adaptive_padding = None
+ padding = to_2tuple(padding)
+
+ self.projection = build_conv_layer(
+ dict(type=conv_type),
+ in_channels=in_channels,
+ out_channels=embed_dims,
+ kernel_size=kernel_size,
+ stride=stride,
+ padding=padding,
+ dilation=dilation,
+ bias=bias)
+
+ if norm_cfg is not None:
+ self.norm = build_norm_layer(norm_cfg, embed_dims)[1]
+ else:
+ self.norm = None
+
+ if input_size:
+ input_size = to_2tuple(input_size)
+ # `init_out_size` would be used outside to
+ # calculate the num_patches
+ # e.g. when `use_abs_pos_embed` outside
+ self.init_input_size = input_size
+ if self.adaptive_padding:
+ pad_h, pad_w = self.adaptive_padding.get_pad_shape(input_size)
+ input_h, input_w = input_size
+ input_h = input_h + pad_h
+ input_w = input_w + pad_w
+ input_size = (input_h, input_w)
+
+ # https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html
+ h_out = (input_size[0] + 2 * padding[0] - dilation[0] *
+ (kernel_size[0] - 1) - 1) // stride[0] + 1
+ w_out = (input_size[1] + 2 * padding[1] - dilation[1] *
+ (kernel_size[1] - 1) - 1) // stride[1] + 1
+ self.init_out_size = (h_out, w_out)
+ else:
+ self.init_input_size = None
+ self.init_out_size = None
+
+ def forward(self, x):
+ """
+ Args:
+ x (Tensor): Has shape (B, C, H, W). In most case, C is 3.
+
+ Returns:
+ tuple: Contains merged results and its spatial shape.
+
+ - x (Tensor): Has shape (B, out_h * out_w, embed_dims)
+ - out_size (tuple[int]): Spatial shape of x, arrange as
+ (out_h, out_w).
+ """
+
+ if self.adaptive_padding:
+ x = self.adaptive_padding(x)
+
+ x = self.projection(x)
+ out_size = (x.shape[2], x.shape[3])
+ x = x.flatten(2).transpose(1, 2)
+ if self.norm is not None:
+ x = self.norm(x)
+ return x, out_size
+
+
+class PatchMerging(BaseModule):
+ """Merge patch feature map.
+
+ This layer groups the feature map by kernel_size, and applies norm and
+ linear layers to the grouped feature map (used in Swin Transformer).
+ Our implementation uses `nn.Unfold` to
+ merge patches, which is about 25% faster than the original
+ implementation. However, we need to modify pretrained
+ models for compatibility.
+
+ Args:
+ in_channels (int): The num of input channels.
+ out_channels (int): The num of output channels.
+ kernel_size (int | tuple, optional): the kernel size in the unfold
+ layer. Defaults to 2.
+ stride (int | tuple, optional): the stride of the sliding blocks in the
+ unfold layer. Default: None. (Would be set as `kernel_size`)
+ padding (int | tuple | string ): The padding length of
+ embedding conv. When it is a string, it means the mode
+ of adaptive padding, support "same" and "corner" now.
+ Default: "corner".
+ dilation (int | tuple, optional): dilation parameter in the unfold
+ layer. Default: 1.
+ bias (bool, optional): Whether to add bias in linear layer or not.
+ Defaults: False.
+ norm_cfg (dict, optional): Config dict for normalization layer.
+ Default: dict(type='LN').
+ init_cfg (dict, optional): The extra config for initialization.
+ Default: None.
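+
+ Example:
+ An illustrative sketch with Swin-like sizes (values assumed only
+ for demonstration):
+
+ >>> import torch
+ >>> merge = PatchMerging(in_channels=96, out_channels=192)
+ >>> x = torch.rand(1, 56 * 56, 96)
+ >>> out, out_size = merge(x, (56, 56))
+ >>> assert out.shape == (1, 28 * 28, 192) and out_size == (28, 28)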
+ """
+
+ def __init__(self,
+ in_channels,
+ out_channels,
+ kernel_size=2,
+ stride=None,
+ padding='corner',
+ dilation=1,
+ bias=False,
+ norm_cfg=dict(type='LN'),
+ init_cfg=None):
+ super().__init__(init_cfg=init_cfg)
+ self.in_channels = in_channels
+ self.out_channels = out_channels
+ if stride:
+ stride = stride
+ else:
+ stride = kernel_size
+
+ kernel_size = to_2tuple(kernel_size)
+ stride = to_2tuple(stride)
+ dilation = to_2tuple(dilation)
+
+ if isinstance(padding, str):
+ self.adaptive_padding = AdaptivePadding(
+ kernel_size=kernel_size,
+ stride=stride,
+ dilation=dilation,
+ padding=padding)
+ # disable the padding of unfold
+ padding = 0
+ else:
+ self.adaptive_padding = None
+
+ padding = to_2tuple(padding)
+ self.sampler = nn.Unfold(
+ kernel_size=kernel_size,
+ dilation=dilation,
+ padding=padding,
+ stride=stride)
+
+ sample_dim = kernel_size[0] * kernel_size[1] * in_channels
+
+ if norm_cfg is not None:
+ self.norm = build_norm_layer(norm_cfg, sample_dim)[1]
+ else:
+ self.norm = None
+
+ self.reduction = nn.Linear(sample_dim, out_channels, bias=bias)
+
+ def forward(self, x, input_size):
+ """
+ Args:
+ x (Tensor): Has shape (B, H*W, C_in).
+ input_size (tuple[int]): The spatial shape of x, arrange as (H, W).
+ Default: None.
+
+ Returns:
+ tuple: Contains merged results and its spatial shape.
+
+ - x (Tensor): Has shape (B, Merged_H * Merged_W, C_out)
+ - out_size (tuple[int]): Spatial shape of x, arrange as
+ (Merged_H, Merged_W).
+ """
+ B, L, C = x.shape
+ assert isinstance(input_size, Sequence), f'Expect ' \
+ f'input_size is ' \
+ f'`Sequence` ' \
+ f'but get {input_size}'
+
+ H, W = input_size
+ assert L == H * W, 'input feature has wrong size'
+
+ x = x.view(B, H, W, C).permute([0, 3, 1, 2]) # B, C, H, W
+
+ if self.adaptive_padding:
+ x = self.adaptive_padding(x)
+ H, W = x.shape[-2:]
+
+ # Use nn.Unfold to merge patch. About 25% faster than original method,
+ # but need to modify pretrained model for compatibility
+ # e.g. if kernel_size=2 and stride=2, x will have shape (B, 4*C, H/2*W/2)
+ x = self.sampler(x)
+
+ out_h = (H + 2 * self.sampler.padding[0] - self.sampler.dilation[0] *
+ (self.sampler.kernel_size[0] - 1) -
+ 1) // self.sampler.stride[0] + 1
+ out_w = (W + 2 * self.sampler.padding[1] - self.sampler.dilation[1] *
+ (self.sampler.kernel_size[1] - 1) -
+ 1) // self.sampler.stride[1] + 1
+
+ output_size = (out_h, out_w)
+ x = x.transpose(1, 2) # B, H/2*W/2, 4*C
+ x = self.norm(x) if self.norm else x
+ x = self.reduction(x)
+ return x, output_size
+
+
+@ATTENTION.register_module()
+class MultiheadAttention(BaseModule):
+ """A wrapper for ``torch.nn.MultiheadAttention``.
+
+ This module implements MultiheadAttention with identity connection,
+ and positional encoding is also passed as input.
+
+ Args:
+ embed_dims (int): The embedding dimension.
+ num_heads (int): Parallel attention heads.
+ attn_drop (float): A Dropout layer on attn_output_weights.
+ Default: 0.0.
+ proj_drop (float): A Dropout layer after `nn.MultiheadAttention`.
+ Default: 0.0.
+ dropout_layer (obj:`ConfigDict`): The dropout_layer used
+ when adding the shortcut.
+ init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
+ Default: None.
+ batch_first (bool): When it is True, Key, Query and Value are shape of
+ (batch, n, embed_dim), otherwise (n, batch, embed_dim).
+ Default to False.
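+
+ Example:
+ A minimal self-attention sketch (shapes chosen only for
+ illustration):
+
+ >>> import torch
+ >>> self_attn = MultiheadAttention(embed_dims=256, num_heads=8,
+ ... batch_first=True)
+ >>> x = torch.rand(2, 100, 256)
+ >>> out = self_attn(query=x)
+ >>> assert out.shape == (2, 100, 256)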
+ """
+
+ def __init__(self,
+ embed_dims,
+ num_heads,
+ attn_drop=0.,
+ proj_drop=0.,
+ dropout_layer=dict(type='Dropout', drop_prob=0.),
+ init_cfg=None,
+ batch_first=False,
+ **kwargs):
+ super(MultiheadAttention, self).__init__(init_cfg)
+ if 'dropout' in kwargs:
+ warnings.warn(
+ 'The argument `dropout` in MultiheadAttention '
+ 'has been deprecated; now you can separately '
+ 'set `attn_drop` (float), `proj_drop` (float), '
+ 'and `dropout_layer` (dict). ', DeprecationWarning)
+ attn_drop = kwargs['dropout']
+ dropout_layer['drop_prob'] = kwargs.pop('dropout')
+
+ self.embed_dims = embed_dims
+ self.num_heads = num_heads
+ self.batch_first = batch_first
+
+ self.attn = nn.MultiheadAttention(embed_dims, num_heads, attn_drop,
+ **kwargs)
+
+ self.proj_drop = nn.Dropout(proj_drop)
+ self.dropout_layer = build_dropout(
+ dropout_layer) if dropout_layer else nn.Identity()
+
+ @deprecated_api_warning({'residual': 'identity'},
+ cls_name='MultiheadAttention')
+ def forward(self,
+ query,
+ key=None,
+ value=None,
+ identity=None,
+ query_pos=None,
+ key_pos=None,
+ attn_mask=None,
+ key_padding_mask=None,
+ **kwargs):
+ """Forward function for `MultiheadAttention`.
+
+ **kwargs allow passing a more general data flow when combining
+ with other operations in `transformerlayer`.
+
+ Args:
+ query (Tensor): The input query with shape [num_queries, bs,
+ embed_dims] if self.batch_first is False, else
+ [bs, num_queries, embed_dims].
+ key (Tensor): The key tensor with shape [num_keys, bs,
+ embed_dims] if self.batch_first is False, else
+ [bs, num_keys, embed_dims] .
+ If None, the ``query`` will be used. Defaults to None.
+ value (Tensor): The value tensor with same shape as `key`.
+ Same in `nn.MultiheadAttention.forward`. Defaults to None.
+ If None, the `key` will be used.
+ identity (Tensor): This tensor, with the same shape as `query`,
+ will be used for the identity link.
+ If None, `query` will be used. Defaults to None.
+ query_pos (Tensor): The positional encoding for `query`, with
+ the same shape as `query`. If not None, it will
+ be added to `query` before the attention. Defaults to None.
+ key_pos (Tensor): The positional encoding for `key`, with the
+ same shape as `key`. Defaults to None. If not None, it will
+ be added to `key` before forward function. If None, and
+ `query_pos` has the same shape as `key`, then `query_pos`
+ will be used for `key_pos`. Defaults to None.
+ attn_mask (Tensor): ByteTensor mask with shape [num_queries,
+ num_keys]. Same in `nn.MultiheadAttention.forward`.
+ Defaults to None.
+ key_padding_mask (Tensor): ByteTensor with shape [bs, num_keys].
+ Defaults to None.
+
+ Returns:
+ Tensor: forwarded results with shape
+ [num_queries, bs, embed_dims]
+ if self.batch_first is False, else
+ [bs, num_queries, embed_dims].
+ """
+
+ if key is None:
+ key = query
+ if value is None:
+ value = key
+ if identity is None:
+ identity = query
+ if key_pos is None:
+ if query_pos is not None:
+ # use query_pos if key_pos is not available
+ if query_pos.shape == key.shape:
+ key_pos = query_pos
+ else:
+ warnings.warn(f'position encoding of key is '
+ f'missing in {self.__class__.__name__}.')
+ if query_pos is not None:
+ query = query + query_pos
+ if key_pos is not None:
+ key = key + key_pos
+
+ # Because the dataflow('key', 'query', 'value') of
+ # ``torch.nn.MultiheadAttention`` is (num_query, batch,
+ # embed_dims), We should adjust the shape of dataflow from
+ # batch_first (batch, num_query, embed_dims) to num_query_first
+ # (num_query ,batch, embed_dims), and recover ``attn_output``
+ # from num_query_first to batch_first.
+ if self.batch_first:
+ query = query.transpose(0, 1)
+ key = key.transpose(0, 1)
+ value = value.transpose(0, 1)
+
+ out = self.attn(
+ query=query,
+ key=key,
+ value=value,
+ attn_mask=attn_mask,
+ key_padding_mask=key_padding_mask)[0]
+
+ if self.batch_first:
+ out = out.transpose(0, 1)
+
+ return identity + self.dropout_layer(self.proj_drop(out))
+
+
+@FEEDFORWARD_NETWORK.register_module()
+class FFN(BaseModule):
+ """Implements feed-forward networks (FFNs) with identity connection.
+
+ Args:
+ embed_dims (int): The feature dimension. Same as
+ `MultiheadAttention`. Defaults: 256.
+ feedforward_channels (int): The hidden dimension of FFNs.
+ Defaults: 1024.
+ num_fcs (int, optional): The number of fully-connected layers in
+ FFNs. Default: 2.
+ act_cfg (dict, optional): The activation config for FFNs.
+ Default: dict(type='ReLU')
+ ffn_drop (float, optional): Probability of an element to be
+ zeroed in FFN. Default 0.0.
+ add_identity (bool, optional): Whether to add the
+ identity connection. Default: `True`.
+ dropout_layer (obj:`ConfigDict`): The dropout_layer used
+ when adding the shortcut.
+ init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
+ Default: None.
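+
+ Example:
+ A minimal sketch (shapes chosen only for illustration):
+
+ >>> import torch
+ >>> ffn = FFN(embed_dims=256, feedforward_channels=1024)
+ >>> x = torch.rand(2, 100, 256)
+ >>> assert ffn(x).shape == (2, 100, 256)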
+ """
+
+ @deprecated_api_warning(
+ {
+ 'dropout': 'ffn_drop',
+ 'add_residual': 'add_identity'
+ },
+ cls_name='FFN')
+ def __init__(self,
+ embed_dims=256,
+ feedforward_channels=1024,
+ num_fcs=2,
+ act_cfg=dict(type='ReLU', inplace=True),
+ ffn_drop=0.,
+ dropout_layer=None,
+ add_identity=True,
+ init_cfg=None,
+ **kwargs):
+ super(FFN, self).__init__(init_cfg)
+ assert num_fcs >= 2, 'num_fcs should be no less ' \
+ f'than 2. got {num_fcs}.'
+ self.embed_dims = embed_dims
+ self.feedforward_channels = feedforward_channels
+ self.num_fcs = num_fcs
+ self.act_cfg = act_cfg
+ self.activate = build_activation_layer(act_cfg)
+
+ layers = []
+ in_channels = embed_dims
+ for _ in range(num_fcs - 1):
+ layers.append(
+ Sequential(
+ Linear(in_channels, feedforward_channels), self.activate,
+ nn.Dropout(ffn_drop)))
+ in_channels = feedforward_channels
+ layers.append(Linear(feedforward_channels, embed_dims))
+ layers.append(nn.Dropout(ffn_drop))
+ self.layers = Sequential(*layers)
+ self.dropout_layer = build_dropout(
+ dropout_layer) if dropout_layer else torch.nn.Identity()
+ self.add_identity = add_identity
+
+ @deprecated_api_warning({'residual': 'identity'}, cls_name='FFN')
+ def forward(self, x, identity=None):
+ """Forward function for `FFN`.
+
+ If `identity` is None, `x` will be used as the identity and added
+ to the output tensor.
+ """
+ out = self.layers(x)
+ if not self.add_identity:
+ return self.dropout_layer(out)
+ if identity is None:
+ identity = x
+ return identity + self.dropout_layer(out)
+
+
+@TRANSFORMER_LAYER.register_module()
+class BaseTransformerLayer(BaseModule):
+ """Base `TransformerLayer` for vision transformer.
+
+ It can be built from `mmcv.ConfigDict` and supports more flexible
+ customization, for example, using any number of `FFN` or `LN` layers
+ and different kinds of `attention` by specifying a list of `ConfigDict`
+ named `attn_cfgs`. It is worth mentioning that it supports `prenorm`
+ when you specify `norm` as the first element of `operation_order`.
+ More details about `prenorm` can be found in `On Layer Normalization
+ in the Transformer Architecture`_ .
+
+ Args:
+ attn_cfgs (list[`mmcv.ConfigDict`] | obj:`mmcv.ConfigDict` | None )):
+ Configs for `self_attention` or `cross_attention` modules,
+ The order of the configs in the list should be consistent with
+ corresponding attentions in operation_order.
+ If it is a dict, all of the attention modules in operation_order
+ will be built with this config. Default: None.
+ ffn_cfgs (list[`mmcv.ConfigDict`] | obj:`mmcv.ConfigDict` | None )):
+ Configs for FFN, The order of the configs in the list should be
+ consistent with corresponding ffn in operation_order.
+ If it is a dict, all of the attention modules in operation_order
+ will be built with this config.
+ operation_order (tuple[str]): The execution order of operation
+ in transformer. Such as ('self_attn', 'norm', 'ffn', 'norm').
+ Supports `prenorm` when you specify the first element as `norm`.
+ Default: None.
+ norm_cfg (dict): Config dict for normalization layer.
+ Default: dict(type='LN').
+ init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
+ Default: None.
+ batch_first (bool): Key, Query and Value are shape
+ of (batch, n, embed_dim)
+ or (n, batch, embed_dim). Default to False.
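+
+ Example:
+ An illustrative encoder-style layer (config values assumed only for
+ demonstration):
+
+ >>> import torch
+ >>> layer = BaseTransformerLayer(
+ ... attn_cfgs=dict(
+ ... type='MultiheadAttention', embed_dims=256, num_heads=8),
+ ... operation_order=('self_attn', 'norm', 'ffn', 'norm'))
+ >>> query = torch.rand(100, 2, 256)
+ >>> assert layer(query).shape == (100, 2, 256)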
+ """
+
+ def __init__(self,
+ attn_cfgs=None,
+ ffn_cfgs=dict(
+ type='FFN',
+ embed_dims=256,
+ feedforward_channels=1024,
+ num_fcs=2,
+ ffn_drop=0.,
+ act_cfg=dict(type='ReLU', inplace=True),
+ ),
+ operation_order=None,
+ norm_cfg=dict(type='LN'),
+ init_cfg=None,
+ batch_first=False,
+ **kwargs):
+
+ deprecated_args = dict(
+ feedforward_channels='feedforward_channels',
+ ffn_dropout='ffn_drop',
+ ffn_num_fcs='num_fcs')
+ for ori_name, new_name in deprecated_args.items():
+ if ori_name in kwargs:
+ warnings.warn(
+ f'The argument `{ori_name}` in BaseTransformerLayer '
+ f'has been deprecated; now you should set `{new_name}` '
+ f'and other FFN related arguments '
+ f'in a dict named `ffn_cfgs`. ', DeprecationWarning)
+ ffn_cfgs[new_name] = kwargs[ori_name]
+
+ super(BaseTransformerLayer, self).__init__(init_cfg)
+
+ self.batch_first = batch_first
+
+ assert set(operation_order) & set(
+ ['self_attn', 'norm', 'ffn', 'cross_attn']) == \
+ set(operation_order), f'The operation_order of' \
+ f' {self.__class__.__name__} should ' \
+ f'only contain operation types from ' \
+ f"{['self_attn', 'norm', 'ffn', 'cross_attn']}"
+
+ num_attn = operation_order.count('self_attn') + operation_order.count(
+ 'cross_attn')
+ if isinstance(attn_cfgs, dict):
+ attn_cfgs = [copy.deepcopy(attn_cfgs) for _ in range(num_attn)]
+ else:
+ assert num_attn == len(attn_cfgs), f'The length ' \
+ f'of attn_cfgs {len(attn_cfgs)} is ' \
+ f'not consistent with the number of attentions ' \
+ f'in operation_order {operation_order}.'
+
+ self.num_attn = num_attn
+ self.operation_order = operation_order
+ self.norm_cfg = norm_cfg
+ self.pre_norm = operation_order[0] == 'norm'
+ self.attentions = ModuleList()
+
+ index = 0
+ for operation_name in operation_order:
+ if operation_name in ['self_attn', 'cross_attn']:
+ if 'batch_first' in attn_cfgs[index]:
+ assert self.batch_first == attn_cfgs[index]['batch_first']
+ else:
+ attn_cfgs[index]['batch_first'] = self.batch_first
+ attention = build_attention(attn_cfgs[index])
+ # Some custom attentions used as `self_attn`
+ # or `cross_attn` can have different behavior.
+ attention.operation_name = operation_name
+ self.attentions.append(attention)
+ index += 1
+
+ self.embed_dims = self.attentions[0].embed_dims
+
+ self.ffns = ModuleList()
+ num_ffns = operation_order.count('ffn')
+ if isinstance(ffn_cfgs, dict):
+ ffn_cfgs = ConfigDict(ffn_cfgs)
+ if isinstance(ffn_cfgs, dict):
+ ffn_cfgs = [copy.deepcopy(ffn_cfgs) for _ in range(num_ffns)]
+ assert len(ffn_cfgs) == num_ffns
+ for ffn_index in range(num_ffns):
+ if 'embed_dims' not in ffn_cfgs[ffn_index]:
+ ffn_cfgs[ffn_index]['embed_dims'] = self.embed_dims
+ else:
+ assert ffn_cfgs[ffn_index]['embed_dims'] == self.embed_dims
+ self.ffns.append(
+ build_feedforward_network(ffn_cfgs[ffn_index],
+ dict(type='FFN')))
+
+ self.norms = ModuleList()
+ num_norms = operation_order.count('norm')
+ for _ in range(num_norms):
+ self.norms.append(build_norm_layer(norm_cfg, self.embed_dims)[1])
+
+ def forward(self,
+ query,
+ key=None,
+ value=None,
+ query_pos=None,
+ key_pos=None,
+ attn_masks=None,
+ query_key_padding_mask=None,
+ key_padding_mask=None,
+ **kwargs):
+ """Forward function for `TransformerDecoderLayer`.
+
+ **kwargs contains some specific arguments of attentions.
+
+ Args:
+ query (Tensor): The input query with shape
+ [num_queries, bs, embed_dims] if
+ self.batch_first is False, else
+ [bs, num_queries, embed_dims].
+ key (Tensor): The key tensor with shape [num_keys, bs,
+ embed_dims] if self.batch_first is False, else
+ [bs, num_keys, embed_dims] .
+ value (Tensor): The value tensor with same shape as `key`.
+ query_pos (Tensor): The positional encoding for `query`.
+ Default: None.
+ key_pos (Tensor): The positional encoding for `key`.
+ Default: None.
+ attn_masks (List[Tensor] | None): 2D Tensor used in
+ calculation of corresponding attention. The length of
+ it should equal to the number of `attention` in
+ `operation_order`. Default: None.
+ query_key_padding_mask (Tensor): ByteTensor for `query`, with
+ shape [bs, num_queries]. Only used in `self_attn` layer.
+ Defaults to None.
+ key_padding_mask (Tensor): ByteTensor for `key`, with
+ shape [bs, num_keys]. Default: None.
+
+ Returns:
+ Tensor: forwarded results with shape [num_queries, bs, embed_dims].
+ """
+
+ norm_index = 0
+ attn_index = 0
+ ffn_index = 0
+ identity = query
+ if attn_masks is None:
+ attn_masks = [None for _ in range(self.num_attn)]
+ elif isinstance(attn_masks, torch.Tensor):
+ attn_masks = [
+ copy.deepcopy(attn_masks) for _ in range(self.num_attn)
+ ]
+ warnings.warn(f'Use same attn_mask in all attentions in '
+ f'{self.__class__.__name__} ')
+ else:
+ assert len(attn_masks) == self.num_attn, f'The length of ' \
+ f'attn_masks {len(attn_masks)} must be equal ' \
+ f'to the number of attention in ' \
+ f'operation_order {self.num_attn}'
+
+ for layer in self.operation_order:
+ if layer == 'self_attn':
+ temp_key = temp_value = query
+ query = self.attentions[attn_index](
+ query,
+ temp_key,
+ temp_value,
+ identity if self.pre_norm else None,
+ query_pos=query_pos,
+ key_pos=query_pos,
+ attn_mask=attn_masks[attn_index],
+ key_padding_mask=query_key_padding_mask,
+ **kwargs)
+ attn_index += 1
+ identity = query
+
+ elif layer == 'norm':
+ query = self.norms[norm_index](query)
+ norm_index += 1
+
+ elif layer == 'cross_attn':
+ query = self.attentions[attn_index](
+ query,
+ key,
+ value,
+ identity if self.pre_norm else None,
+ query_pos=query_pos,
+ key_pos=key_pos,
+ attn_mask=attn_masks[attn_index],
+ key_padding_mask=key_padding_mask,
+ **kwargs)
+ attn_index += 1
+ identity = query
+
+ elif layer == 'ffn':
+ query = self.ffns[ffn_index](
+ query, identity if self.pre_norm else None)
+ ffn_index += 1
+
+ return query
+
+
+@TRANSFORMER_LAYER_SEQUENCE.register_module()
+class TransformerLayerSequence(BaseModule):
+ """Base class for TransformerEncoder and TransformerDecoder in vision
+ transformer.
+
+ As base-class of Encoder and Decoder in vision transformer.
+ Support customization such as specifying different kind
+ of `transformer_layer` in `transformer_coder`.
+
+ Args:
+ transformerlayers (list[obj:`mmcv.ConfigDict`] |
+ obj:`mmcv.ConfigDict`): Config of the transformer layers
+ in TransformerCoder. If it is obj:`mmcv.ConfigDict`,
+ it would be repeated `num_layers` times to build a
+ list[`mmcv.ConfigDict`]. Default: None.
+ num_layers (int): The number of `TransformerLayer`. Default: None.
+ init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
+ Default: None.
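+
+ Example:
+ An illustrative two-layer encoder (config values assumed only for
+ demonstration):
+
+ >>> import torch
+ >>> encoder = TransformerLayerSequence(
+ ... transformerlayers=dict(
+ ... type='BaseTransformerLayer',
+ ... attn_cfgs=dict(
+ ... type='MultiheadAttention', embed_dims=256, num_heads=8),
+ ... operation_order=('self_attn', 'norm', 'ffn', 'norm')),
+ ... num_layers=2)
+ >>> query = torch.rand(100, 2, 256)
+ >>> assert encoder(query, None, None).shape == (100, 2, 256)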
+ """
+
+ def __init__(self, transformerlayers=None, num_layers=None, init_cfg=None):
+ super(TransformerLayerSequence, self).__init__(init_cfg)
+ if isinstance(transformerlayers, dict):
+ transformerlayers = [
+ copy.deepcopy(transformerlayers) for _ in range(num_layers)
+ ]
+ else:
+ assert isinstance(transformerlayers, list) and \
+ len(transformerlayers) == num_layers
+ self.num_layers = num_layers
+ self.layers = ModuleList()
+ for i in range(num_layers):
+ self.layers.append(build_transformer_layer(transformerlayers[i]))
+ self.embed_dims = self.layers[0].embed_dims
+ self.pre_norm = self.layers[0].pre_norm
+
+ def forward(self,
+ query,
+ key,
+ value,
+ query_pos=None,
+ key_pos=None,
+ attn_masks=None,
+ query_key_padding_mask=None,
+ key_padding_mask=None,
+ **kwargs):
+ """Forward function for `TransformerCoder`.
+
+ Args:
+ query (Tensor): Input query with shape
+ `(num_queries, bs, embed_dims)`.
+ key (Tensor): The key tensor with shape
+ `(num_keys, bs, embed_dims)`.
+ value (Tensor): The value tensor with shape
+ `(num_keys, bs, embed_dims)`.
+ query_pos (Tensor): The positional encoding for `query`.
+ Default: None.
+ key_pos (Tensor): The positional encoding for `key`.
+ Default: None.
+ attn_masks (List[Tensor], optional): Each element is 2D Tensor
+ which is used in calculation of corresponding attention in
+ operation_order. Default: None.
+ query_key_padding_mask (Tensor): ByteTensor for `query`, with
+ shape [bs, num_queries]. Only used in self-attention.
+ Default: None.
+ key_padding_mask (Tensor): ByteTensor for `key`, with
+ shape [bs, num_keys]. Default: None.
+
+ Returns:
+ Tensor: results with shape [num_queries, bs, embed_dims].
+ """
+ for layer in self.layers:
+ query = layer(
+ query,
+ key,
+ value,
+ query_pos=query_pos,
+ key_pos=key_pos,
+ attn_masks=attn_masks,
+ query_key_padding_mask=query_key_padding_mask,
+ key_padding_mask=key_padding_mask,
+ **kwargs)
+ return query
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/cnn/bricks/upsample.py b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/cnn/bricks/upsample.py
new file mode 100644
index 0000000000000000000000000000000000000000..081d22c86a511d0372195e3d40207624bf3cb6c3
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/cnn/bricks/upsample.py
@@ -0,0 +1,97 @@
+# encoding=utf-8
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch.nn as nn
+import torch.nn.functional as F
+
+from ..utils import xavier_init
+from .registry import UPSAMPLE_LAYERS
+
+UPSAMPLE_LAYERS.register_module('nearest', module=nn.Upsample)
+UPSAMPLE_LAYERS.register_module('bilinear', module=nn.Upsample)
+
+
+@UPSAMPLE_LAYERS.register_module(name='pixel_shuffle')
+class PixelShufflePack(nn.Module):
+ """Pixel Shuffle upsample layer.
+
+ This module packs `F.pixel_shuffle()` and a nn.Conv2d module together to
+ achieve a simple upsampling with pixel shuffle.
+
+ Args:
+ in_channels (int): Number of input channels.
+ out_channels (int): Number of output channels.
+ scale_factor (int): Upsample ratio.
+ upsample_kernel (int): Kernel size of the conv layer to expand the
+ channels.
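+
+ Example:
+ A minimal sketch (sizes assumed only for illustration):
+
+ >>> import torch
+ >>> up = PixelShufflePack(64, 32, scale_factor=2, upsample_kernel=3)
+ >>> x = torch.rand(1, 64, 8, 8)
+ >>> assert up(x).shape == (1, 32, 16, 16)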
+ """
+
+ def __init__(self, in_channels, out_channels, scale_factor,
+ upsample_kernel):
+ super(PixelShufflePack, self).__init__()
+ self.in_channels = in_channels
+ self.out_channels = out_channels
+ self.scale_factor = scale_factor
+ self.upsample_kernel = upsample_kernel
+ self.upsample_conv = nn.Conv2d(
+ self.in_channels,
+ self.out_channels * scale_factor * scale_factor,
+ self.upsample_kernel,
+ padding=(self.upsample_kernel - 1) // 2)
+ self.init_weights()
+
+ def init_weights(self):
+ xavier_init(self.upsample_conv, distribution='uniform')
+
+ def forward(self, x):
+ x = self.upsample_conv(x)
+ x = F.pixel_shuffle(x, self.scale_factor)
+ return x
+
+
+def build_upsample_layer(cfg, *args, **kwargs):
+ """Build upsample layer.
+
+ Args:
+ cfg (dict): The upsample layer config, which should contain:
+
+ - type (str): Layer type.
+ - scale_factor (int): Upsample ratio, which is not applicable to
+ deconv.
+ - layer args: Args needed to instantiate an upsample layer.
+ args (argument list): Arguments passed to the ``__init__``
+ method of the corresponding upsample layer.
+ kwargs (keyword arguments): Keyword arguments passed to the
+ ``__init__`` method of the corresponding upsample layer.
+
+ Returns:
+ nn.Module: Created upsample layer.
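+
+ Example:
+ A minimal sketch, using the ``bilinear`` entry registered above:
+
+ >>> import torch
+ >>> up = build_upsample_layer(dict(type='bilinear', scale_factor=2))
+ >>> x = torch.rand(1, 3, 8, 8)
+ >>> assert up(x).shape == (1, 3, 16, 16)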
+ """
+ if not isinstance(cfg, dict):
+ raise TypeError(f'cfg must be a dict, but got {type(cfg)}')
+ if 'type' not in cfg:
+ raise KeyError(
+ f'the cfg dict must contain the key "type", but got {cfg}')
+ cfg_ = cfg.copy()
+
+ layer_type = cfg_.pop('type')
+ if layer_type not in UPSAMPLE_LAYERS:
+ raise KeyError(f'Unrecognized upsample type {layer_type}')
+ else:
+ upsample = UPSAMPLE_LAYERS.get(layer_type)
+
+ if upsample is nn.Upsample:
+ cfg_['mode'] = layer_type
+ layer = upsample(*args, **kwargs, **cfg_)
+ return layer
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/cnn/bricks/wrappers.py b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/cnn/bricks/wrappers.py
new file mode 100644
index 0000000000000000000000000000000000000000..c8df7dc51d4941dbe460add9d10464bc7ecb9331
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/cnn/bricks/wrappers.py
@@ -0,0 +1,193 @@
+# encoding=utf-8
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+r"""Modified from https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/wrappers.py # noqa: E501
+
+Wrap some nn modules to support empty tensor input. Currently, these wrappers
+are mainly used in mask heads like fcn_mask_head and maskiou_heads since mask
+heads are trained on only positive RoIs.
+"""
+import math
+
+import torch
+import torch.nn as nn
+from torch.nn.modules.utils import _pair, _triple
+
+from .registry import CONV_LAYERS, UPSAMPLE_LAYERS
+
+if torch.__version__ == 'parrots':
+ TORCH_VERSION = torch.__version__
+else:
+ # torch.__version__ could be 1.3.1+cu92, we only need the first two
+ # for comparison
+ TORCH_VERSION = tuple(int(x) for x in torch.__version__.split('.')[:2])
+
+
+def obsolete_torch_version(torch_version, version_threshold):
+ return torch_version == 'parrots' or torch_version <= version_threshold
+
+
+class NewEmptyTensorOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, x, new_shape):
+ ctx.shape = x.shape
+ return x.new_empty(new_shape)
+
+ @staticmethod
+ def backward(ctx, grad):
+ shape = ctx.shape
+ return NewEmptyTensorOp.apply(grad, shape), None
+
+
+@CONV_LAYERS.register_module('Conv', force=True)
+class Conv2d(nn.Conv2d):
+
+ def forward(self, x):
+ if x.numel() == 0 and obsolete_torch_version(TORCH_VERSION, (1, 4)):
+ out_shape = [x.shape[0], self.out_channels]
+ for i, k, p, s, d in zip(x.shape[-2:], self.kernel_size,
+ self.padding, self.stride, self.dilation):
+ o = (i + 2 * p - (d * (k - 1) + 1)) // s + 1
+ out_shape.append(o)
+ empty = NewEmptyTensorOp.apply(x, out_shape)
+ if self.training:
+ # produce dummy gradient to avoid DDP warning.
+ dummy = sum(x.view(-1)[0] for x in self.parameters()) * 0.0
+ return empty + dummy
+ else:
+ return empty
+
+ return super().forward(x)
+
+
+@CONV_LAYERS.register_module('Conv3d', force=True)
+class Conv3d(nn.Conv3d):
+
+ def forward(self, x):
+ if x.numel() == 0 and obsolete_torch_version(TORCH_VERSION, (1, 4)):
+ out_shape = [x.shape[0], self.out_channels]
+ for i, k, p, s, d in zip(x.shape[-3:], self.kernel_size,
+ self.padding, self.stride, self.dilation):
+ o = (i + 2 * p - (d * (k - 1) + 1)) // s + 1
+ out_shape.append(o)
+ empty = NewEmptyTensorOp.apply(x, out_shape)
+ if self.training:
+ # produce dummy gradient to avoid DDP warning.
+ dummy = sum(x.view(-1)[0] for x in self.parameters()) * 0.0
+ return empty + dummy
+ else:
+ return empty
+
+ return super().forward(x)
+
+
+@CONV_LAYERS.register_module()
+@CONV_LAYERS.register_module('deconv')
+@UPSAMPLE_LAYERS.register_module('deconv', force=True)
+class ConvTranspose2d(nn.ConvTranspose2d):
+
+ def forward(self, x):
+ if x.numel() == 0 and obsolete_torch_version(TORCH_VERSION, (1, 4)):
+ out_shape = [x.shape[0], self.out_channels]
+ for i, k, p, s, d, op in zip(x.shape[-2:], self.kernel_size,
+ self.padding, self.stride,
+ self.dilation, self.output_padding):
+ out_shape.append((i - 1) * s - 2 * p + (d * (k - 1) + 1) + op)
+ empty = NewEmptyTensorOp.apply(x, out_shape)
+ if self.training:
+ # produce dummy gradient to avoid DDP warning.
+ dummy = sum(x.view(-1)[0] for x in self.parameters()) * 0.0
+ return empty + dummy
+ else:
+ return empty
+
+ return super().forward(x)
+
+
+@CONV_LAYERS.register_module()
+@CONV_LAYERS.register_module('deconv3d')
+@UPSAMPLE_LAYERS.register_module('deconv3d', force=True)
+class ConvTranspose3d(nn.ConvTranspose3d):
+
+ def forward(self, x):
+ if x.numel() == 0 and obsolete_torch_version(TORCH_VERSION, (1, 4)):
+ out_shape = [x.shape[0], self.out_channels]
+ for i, k, p, s, d, op in zip(x.shape[-3:], self.kernel_size,
+ self.padding, self.stride,
+ self.dilation, self.output_padding):
+ out_shape.append((i - 1) * s - 2 * p + (d * (k - 1) + 1) + op)
+ empty = NewEmptyTensorOp.apply(x, out_shape)
+ if self.training:
+ # produce dummy gradient to avoid DDP warning.
+ dummy = sum(x.view(-1)[0] for x in self.parameters()) * 0.0
+ return empty + dummy
+ else:
+ return empty
+
+ return super().forward(x)
+
+
+class MaxPool2d(nn.MaxPool2d):
+
+ def forward(self, x):
+ # PyTorch 1.9 does not support empty tensor inference yet
+ if x.numel() == 0 and obsolete_torch_version(TORCH_VERSION, (1, 9)):
+ out_shape = list(x.shape[:2])
+ for i, k, p, s, d in zip(x.shape[-2:], _pair(self.kernel_size),
+ _pair(self.padding), _pair(self.stride),
+ _pair(self.dilation)):
+ o = (i + 2 * p - (d * (k - 1) + 1)) / s + 1
+ o = math.ceil(o) if self.ceil_mode else math.floor(o)
+ out_shape.append(o)
+ empty = NewEmptyTensorOp.apply(x, out_shape)
+ return empty
+
+ return super().forward(x)
+
+
+class MaxPool3d(nn.MaxPool3d):
+
+ def forward(self, x):
+ # PyTorch 1.9 does not support empty tensor inference yet
+ if x.numel() == 0 and obsolete_torch_version(TORCH_VERSION, (1, 9)):
+ out_shape = list(x.shape[:2])
+ for i, k, p, s, d in zip(x.shape[-3:], _triple(self.kernel_size),
+ _triple(self.padding),
+ _triple(self.stride),
+ _triple(self.dilation)):
+ o = (i + 2 * p - (d * (k - 1) + 1)) / s + 1
+ o = math.ceil(o) if self.ceil_mode else math.floor(o)
+ out_shape.append(o)
+ empty = NewEmptyTensorOp.apply(x, out_shape)
+ return empty
+
+ return super().forward(x)
+
+
+class Linear(torch.nn.Linear):
+
+ def forward(self, x):
+ # empty tensor forward of Linear layer is supported in PyTorch 1.6
+ if x.numel() == 0 and obsolete_torch_version(TORCH_VERSION, (1, 5)):
+ out_shape = [x.shape[0], self.out_features]
+ empty = NewEmptyTensorOp.apply(x, out_shape)
+ if self.training:
+ # produce dummy gradient to avoid DDP warning.
+ dummy = sum(x.view(-1)[0] for x in self.parameters()) * 0.0
+ return empty + dummy
+ else:
+ return empty
+
+ return super().forward(x)
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/cnn/builder.py b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/cnn/builder.py
new file mode 100644
index 0000000000000000000000000000000000000000..07d37b06f64b7aba94f5315b01261af50e8ea4a9
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/cnn/builder.py
@@ -0,0 +1,43 @@
+# encoding=utf-8
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from ..runner import Sequential
+from ..utils import Registry, build_from_cfg
+
+
+def build_model_from_cfg(cfg, registry, default_args=None):
+ """Build a PyTorch model from config dict(s). Different from
+ ``build_from_cfg``, if cfg is a list, a ``nn.Sequential`` will be built.
+
+ Args:
+ cfg (dict, list[dict]): The config of modules; it is either a config
+ dict or a list of config dicts. If cfg is a list,
+ the built modules will be wrapped with ``nn.Sequential``.
+ registry (:obj:`Registry`): A registry the module belongs to.
+ default_args (dict, optional): Default arguments to build the module.
+ Defaults to None.
+
+ Returns:
+ nn.Module: A built nn module.
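+
+ Example:
+ A hypothetical sketch; the ``TOY`` registry and its ``ReLU`` entry
+ are illustrative and not part of this module:
+
+ >>> import torch.nn as nn
+ >>> TOY = Registry('toy')
+ >>> TOY.register_module('ReLU', module=nn.ReLU)
+ >>> single = build_model_from_cfg(dict(type='ReLU'), TOY)
+ >>> stacked = build_model_from_cfg([dict(type='ReLU')] * 2, TOY)
+ >>> assert isinstance(single, nn.ReLU)
+ >>> assert isinstance(stacked, Sequential) and len(stacked) == 2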
+ """
+ if isinstance(cfg, list):
+ modules = [
+ build_from_cfg(cfg_, registry, default_args) for cfg_ in cfg
+ ]
+ return Sequential(*modules)
+ else:
+ return build_from_cfg(cfg, registry, default_args)
+
+
+MODELS = Registry('model', build_func=build_model_from_cfg)
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/cnn/resnet.py b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/cnn/resnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..584c1718d33070e7dede8defe9369106d6652b96
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/cnn/resnet.py
@@ -0,0 +1,329 @@
+# encoding=utf-8
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import logging
+
+import torch.nn as nn
+import torch.utils.checkpoint as cp
+
+from .utils import constant_init, kaiming_init
+
+
+def conv3x3(in_planes, out_planes, stride=1, dilation=1):
+ """3x3 convolution with padding."""
+ return nn.Conv2d(
+ in_planes,
+ out_planes,
+ kernel_size=3,
+ stride=stride,
+ padding=dilation,
+ dilation=dilation,
+ bias=False)
+
+
+class BasicBlock(nn.Module):
+ expansion = 1
+
+ def __init__(self,
+ inplanes,
+ planes,
+ stride=1,
+ dilation=1,
+ downsample=None,
+ style='pytorch',
+ with_cp=False):
+ super(BasicBlock, self).__init__()
+ assert style in ['pytorch', 'caffe']
+ self.conv1 = conv3x3(inplanes, planes, stride, dilation)
+ self.bn1 = nn.BatchNorm2d(planes)
+ self.relu = nn.ReLU(inplace=True)
+ self.conv2 = conv3x3(planes, planes)
+ self.bn2 = nn.BatchNorm2d(planes)
+ self.downsample = downsample
+ self.stride = stride
+ self.dilation = dilation
+ assert not with_cp
+
+ def forward(self, x):
+ residual = x
+
+ out = self.conv1(x)
+ out = self.bn1(out)
+ out = self.relu(out)
+
+ out = self.conv2(out)
+ out = self.bn2(out)
+
+ if self.downsample is not None:
+ residual = self.downsample(x)
+
+ out += residual
+ out = self.relu(out)
+
+ return out
+
+
+class Bottleneck(nn.Module):
+ expansion = 4
+
+ def __init__(self,
+ inplanes,
+ planes,
+ stride=1,
+ dilation=1,
+ downsample=None,
+ style='pytorch',
+ with_cp=False):
+ """Bottleneck block.
+
+ If style is "pytorch", the stride-two layer is the 3x3 conv layer, if
+ it is "caffe", the stride-two layer is the first 1x1 conv layer.
+ """
+ super(Bottleneck, self).__init__()
+ assert style in ['pytorch', 'caffe']
+ if style == 'pytorch':
+ conv1_stride = 1
+ conv2_stride = stride
+ else:
+ conv1_stride = stride
+ conv2_stride = 1
+ self.conv1 = nn.Conv2d(
+ inplanes, planes, kernel_size=1, stride=conv1_stride, bias=False)
+ self.conv2 = nn.Conv2d(
+ planes,
+ planes,
+ kernel_size=3,
+ stride=conv2_stride,
+ padding=dilation,
+ dilation=dilation,
+ bias=False)
+
+ self.bn1 = nn.BatchNorm2d(planes)
+ self.bn2 = nn.BatchNorm2d(planes)
+ self.conv3 = nn.Conv2d(
+ planes, planes * self.expansion, kernel_size=1, bias=False)
+ self.bn3 = nn.BatchNorm2d(planes * self.expansion)
+ self.relu = nn.ReLU(inplace=True)
+ self.downsample = downsample
+ self.stride = stride
+ self.dilation = dilation
+ self.with_cp = with_cp
+
+ def forward(self, x):
+
+ def _inner_forward(x):
+ residual = x
+
+ out = self.conv1(x)
+ out = self.bn1(out)
+ out = self.relu(out)
+
+ out = self.conv2(out)
+ out = self.bn2(out)
+ out = self.relu(out)
+
+ out = self.conv3(out)
+ out = self.bn3(out)
+
+ if self.downsample is not None:
+ residual = self.downsample(x)
+
+ out += residual
+
+ return out
+
+ if self.with_cp and x.requires_grad:
+ out = cp.checkpoint(_inner_forward, x)
+ else:
+ out = _inner_forward(x)
+
+ out = self.relu(out)
+
+ return out
+
+
+def make_res_layer(block,
+ inplanes,
+ planes,
+ blocks,
+ stride=1,
+ dilation=1,
+ style='pytorch',
+ with_cp=False):
+ downsample = None
+ if stride != 1 or inplanes != planes * block.expansion:
+ downsample = nn.Sequential(
+ nn.Conv2d(
+ inplanes,
+ planes * block.expansion,
+ kernel_size=1,
+ stride=stride,
+ bias=False),
+ nn.BatchNorm2d(planes * block.expansion),
+ )
+
+ layers = []
+ layers.append(
+ block(
+ inplanes,
+ planes,
+ stride,
+ dilation,
+ downsample,
+ style=style,
+ with_cp=with_cp))
+ inplanes = planes * block.expansion
+ for _ in range(1, blocks):
+ layers.append(
+ block(inplanes, planes, 1, dilation, style=style, with_cp=with_cp))
+
+ return nn.Sequential(*layers)
+
+
+class ResNet(nn.Module):
+ """ResNet backbone.
+
+ Args:
+ depth (int): Depth of resnet, from {18, 34, 50, 101, 152}.
+ num_stages (int): Resnet stages, normally 4.
+ strides (Sequence[int]): Strides of the first block of each stage.
+ dilations (Sequence[int]): Dilation of each stage.
+ out_indices (Sequence[int]): Output from which stages.
+ style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two
+ layer is the 3x3 conv layer, otherwise the stride-two layer is
+ the first 1x1 conv layer.
+ frozen_stages (int): Stages to be frozen (all param fixed). -1 means
+ not freezing any parameters.
+ bn_eval (bool): Whether to set BN layers as eval mode, namely, freeze
+ running stats (mean and var).
+ bn_frozen (bool): Whether to freeze weight and bias of BN layers.
+ with_cp (bool): Use checkpoint or not. Using checkpoint will save some
+ memory while slowing down the training speed.
+ """
+
+ arch_settings = {
+ 18: (BasicBlock, (2, 2, 2, 2)),
+ 34: (BasicBlock, (3, 4, 6, 3)),
+ 50: (Bottleneck, (3, 4, 6, 3)),
+ 101: (Bottleneck, (3, 4, 23, 3)),
+ 152: (Bottleneck, (3, 8, 36, 3))
+ }
+
+ def __init__(self,
+ depth,
+ num_stages=4,
+ strides=(1, 2, 2, 2),
+ dilations=(1, 1, 1, 1),
+ out_indices=(0, 1, 2, 3),
+ style='pytorch',
+ frozen_stages=-1,
+ bn_eval=True,
+ bn_frozen=False,
+ with_cp=False):
+ super(ResNet, self).__init__()
+ if depth not in self.arch_settings:
+ raise KeyError(f'invalid depth {depth} for resnet')
+ assert num_stages >= 1 and num_stages <= 4
+ block, stage_blocks = self.arch_settings[depth]
+ stage_blocks = stage_blocks[:num_stages]
+ assert len(strides) == len(dilations) == num_stages
+ assert max(out_indices) < num_stages
+
+ self.out_indices = out_indices
+ self.style = style
+ self.frozen_stages = frozen_stages
+ self.bn_eval = bn_eval
+ self.bn_frozen = bn_frozen
+ self.with_cp = with_cp
+
+ self.inplanes = 64
+ self.conv1 = nn.Conv2d(
+ 3, 64, kernel_size=7, stride=2, padding=3, bias=False)
+ self.bn1 = nn.BatchNorm2d(64)
+ self.relu = nn.ReLU(inplace=True)
+ self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
+
+ self.res_layers = []
+ for i, num_blocks in enumerate(stage_blocks):
+ stride = strides[i]
+ dilation = dilations[i]
+ planes = 64 * 2**i
+ res_layer = make_res_layer(
+ block,
+ self.inplanes,
+ planes,
+ num_blocks,
+ stride=stride,
+ dilation=dilation,
+ style=self.style,
+ with_cp=with_cp)
+ self.inplanes = planes * block.expansion
+ layer_name = f'layer{i + 1}'
+ self.add_module(layer_name, res_layer)
+ self.res_layers.append(layer_name)
+
+ self.feat_dim = block.expansion * 64 * 2**(len(stage_blocks) - 1)
+
+ def init_weights(self, pretrained=None):
+ if isinstance(pretrained, str):
+ logger = logging.getLogger()
+ from ..runner import load_checkpoint
+ load_checkpoint(self, pretrained, strict=False, logger=logger)
+ elif pretrained is None:
+ for m in self.modules():
+ if isinstance(m, nn.Conv2d):
+ kaiming_init(m)
+ elif isinstance(m, nn.BatchNorm2d):
+ constant_init(m, 1)
+ else:
+ raise TypeError('pretrained must be a str or None')
+
+ def forward(self, x):
+ x = self.conv1(x)
+ x = self.bn1(x)
+ x = self.relu(x)
+ x = self.maxpool(x)
+ outs = []
+ for i, layer_name in enumerate(self.res_layers):
+ res_layer = getattr(self, layer_name)
+ x = res_layer(x)
+ if i in self.out_indices:
+ outs.append(x)
+ if len(outs) == 1:
+ return outs[0]
+ else:
+ return tuple(outs)
+
+ def train(self, mode=True):
+ super(ResNet, self).train(mode)
+ if self.bn_eval:
+ for m in self.modules():
+ if isinstance(m, nn.BatchNorm2d):
+ m.eval()
+ if self.bn_frozen:
+ for params in m.parameters():
+ params.requires_grad = False
+ if mode and self.frozen_stages >= 0:
+ for param in self.conv1.parameters():
+ param.requires_grad = False
+ for param in self.bn1.parameters():
+ param.requires_grad = False
+ self.bn1.eval()
+ self.bn1.weight.requires_grad = False
+ self.bn1.bias.requires_grad = False
+ for i in range(1, self.frozen_stages + 1):
+ mod = getattr(self, f'layer{i}')
+ mod.eval()
+ for param in mod.parameters():
+ param.requires_grad = False
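+
+
+# Usage sketch (editor's illustration): a ResNet-50 backbone that returns the
+# four stage feature maps; shapes assume a 3x224x224 input.
+#
+#   import torch
+#   backbone = ResNet(depth=50, out_indices=(0, 1, 2, 3))
+#   backbone.init_weights()
+#   c2, c3, c4, c5 = backbone(torch.randn(1, 3, 224, 224))
+#   # channels 256/512/1024/2048 at strides 4/8/16/32 respectively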
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/cnn/utils/__init__.py b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/cnn/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..cd99a47b99bc1fd0337f3d03bd8666eb19fd197a
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/cnn/utils/__init__.py
@@ -0,0 +1,32 @@
+# encoding=utf-8
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .flops_counter import get_model_complexity_info
+from .fuse_conv_bn import fuse_conv_bn
+from .sync_bn import revert_sync_batchnorm
+from .weight_init import (INITIALIZERS, Caffe2XavierInit, ConstantInit,
+ KaimingInit, NormalInit, PretrainedInit,
+ TruncNormalInit, UniformInit, XavierInit,
+ bias_init_with_prob, caffe2_xavier_init,
+ constant_init, initialize, kaiming_init, normal_init,
+ trunc_normal_init, uniform_init, xavier_init)
+
+__all__ = [
+ 'get_model_complexity_info', 'bias_init_with_prob', 'caffe2_xavier_init',
+ 'constant_init', 'kaiming_init', 'normal_init', 'trunc_normal_init',
+ 'uniform_init', 'xavier_init', 'fuse_conv_bn', 'initialize',
+ 'INITIALIZERS', 'ConstantInit', 'XavierInit', 'NormalInit',
+ 'TruncNormalInit', 'UniformInit', 'KaimingInit', 'PretrainedInit',
+ 'Caffe2XavierInit', 'revert_sync_batchnorm'
+]
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/cnn/utils/flops_counter.py b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/cnn/utils/flops_counter.py
new file mode 100644
index 0000000000000000000000000000000000000000..fda698ed9acfbcbcb4fb42397fbcf7ea00733914
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/cnn/utils/flops_counter.py
@@ -0,0 +1,589 @@
+# encoding=utf-8
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+import warnings
+from functools import partial
+
+import numpy as np
+import torch
+import torch.nn as nn
+
+import mmcv
+
+
+def get_model_complexity_info(model,
+ input_shape,
+ print_per_layer_stat=True,
+ as_strings=True,
+ input_constructor=None,
+ flush=False,
+ ost=sys.stdout):
+ """Get complexity information of a model.
+
+ This method can calculate FLOPs and parameter counts of a model with
+ corresponding input shape. It can also print complexity information for
+ each layer in a model.
+
+ Supported layers are listed as below:
+ - Convolutions: ``nn.Conv1d``, ``nn.Conv2d``, ``nn.Conv3d``.
+ - Activations: ``nn.ReLU``, ``nn.PReLU``, ``nn.ELU``,
+ ``nn.LeakyReLU``, ``nn.ReLU6``.
+ - Poolings: ``nn.MaxPool1d``, ``nn.MaxPool2d``, ``nn.MaxPool3d``,
+ ``nn.AvgPool1d``, ``nn.AvgPool2d``, ``nn.AvgPool3d``,
+ ``nn.AdaptiveMaxPool1d``, ``nn.AdaptiveMaxPool2d``,
+ ``nn.AdaptiveMaxPool3d``, ``nn.AdaptiveAvgPool1d``,
+ ``nn.AdaptiveAvgPool2d``, ``nn.AdaptiveAvgPool3d``.
+ - BatchNorms: ``nn.BatchNorm1d``, ``nn.BatchNorm2d``,
+ ``nn.BatchNorm3d``, ``nn.GroupNorm``, ``nn.InstanceNorm1d``,
+ ``InstanceNorm2d``, ``InstanceNorm3d``, ``nn.LayerNorm``.
+ - Linear: ``nn.Linear``.
+ - Deconvolution: ``nn.ConvTranspose2d``.
+ - Upsample: ``nn.Upsample``.
+
+ Args:
+ model (nn.Module): The model for complexity calculation.
+ input_shape (tuple): Input shape used for calculation.
+ print_per_layer_stat (bool): Whether to print complexity information
+ for each layer in a model. Default: True.
+ as_strings (bool): Output FLOPs and params counts in a string form.
+ Default: True.
+ input_constructor (None | callable): If specified, it takes a callable
+ method that generates the input. Otherwise, a random tensor with
+ the given input shape is generated to calculate FLOPs. Default: None.
+ flush (bool): same as that in :func:`print`. Default: False.
+ ost (stream): same as ``file`` param in :func:`print`.
+ Default: sys.stdout.
+
+ Returns:
+ tuple[float | str]: If ``as_strings`` is set to True, it will return
+ FLOPs and parameter counts in a string format. Otherwise, it will
+ return them as float numbers.
+ """
+ assert type(input_shape) is tuple
+ assert len(input_shape) >= 1
+ assert isinstance(model, nn.Module)
+ flops_model = add_flops_counting_methods(model)
+ flops_model.eval()
+ flops_model.start_flops_count()
+ if input_constructor:
+ input = input_constructor(input_shape)
+ _ = flops_model(**input)
+ else:
+ try:
+ batch = torch.ones(()).new_empty(
+ (1, *input_shape),
+ dtype=next(flops_model.parameters()).dtype,
+ device=next(flops_model.parameters()).device)
+ except StopIteration:
+ # Avoid StopIteration for models which have no parameters,
+ # like `nn.Relu()`, `nn.AvgPool2d`, etc.
+ batch = torch.ones(()).new_empty((1, *input_shape))
+
+ _ = flops_model(batch)
+
+ flops_count, params_count = flops_model.compute_average_flops_cost()
+ if print_per_layer_stat:
+ print_model_with_flops(
+ flops_model, flops_count, params_count, ost=ost, flush=flush)
+ flops_model.stop_flops_count()
+
+ if as_strings:
+ return flops_to_string(flops_count), params_to_string(params_count)
+
+ return flops_count, params_count
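+
+
+# Usage sketch (editor's illustration, assuming torchvision is available):
+#
+#   import torchvision
+#   model = torchvision.models.resnet18()
+#   flops, params = get_model_complexity_info(
+#       model, (3, 224, 224), print_per_layer_stat=False)
+#   # e.g. flops ~ '1.82 GFLOPs', params ~ '11.69 M'
+#   # (one multiply-add is counted as one FLOP, see flops_to_string below)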
+
+
+def flops_to_string(flops, units='GFLOPs', precision=2):
+ """Convert FLOPs number into a string.
+
+ Note that here one multiply-add is counted as one FLOP.
+
+ Args:
+ flops (float): FLOPs number to be converted.
+ units (str | None): Converted FLOPs units. Options are None, 'GFLOPs',
+ 'MFLOPs', 'KFLOPs', 'FLOPs'. If set to None, it will automatically
+ choose the most suitable unit for FLOPs. Default: 'GFLOPs'.
+ precision (int): Digit number after the decimal point. Default: 2.
+
+ Returns:
+ str: The converted FLOPs number with units.
+
+ Examples:
+ >>> flops_to_string(1e9)
+ '1.0 GFLOPs'
+ >>> flops_to_string(2e5, 'MFLOPs')
+ '0.2 MFLOPs'
+ >>> flops_to_string(3e-9, None)
+ '3e-09 FLOPs'
+ """
+ if units is None:
+ if flops // 10**9 > 0:
+ return str(round(flops / 10.**9, precision)) + ' GFLOPs'
+ elif flops // 10**6 > 0:
+ return str(round(flops / 10.**6, precision)) + ' MFLOPs'
+ elif flops // 10**3 > 0:
+ return str(round(flops / 10.**3, precision)) + ' KFLOPs'
+ else:
+ return str(flops) + ' FLOPs'
+ else:
+ if units == 'GFLOPs':
+ return str(round(flops / 10.**9, precision)) + ' ' + units
+ elif units == 'MFLOPs':
+ return str(round(flops / 10.**6, precision)) + ' ' + units
+ elif units == 'KFLOPs':
+ return str(round(flops / 10.**3, precision)) + ' ' + units
+ else:
+ return str(flops) + ' FLOPs'
+
+
+def params_to_string(num_params, units=None, precision=2):
+ """Convert parameter number into a string.
+
+ Args:
+ num_params (float): Parameter number to be converted.
+ units (str | None): Converted parameter units. Options are None, 'M',
+ 'K' and ''. If set to None, it will automatically choose the most
+ suitable unit for the parameter number. Default: None.
+ precision (int): Digit number after the decimal point. Default: 2.
+
+ Returns:
+ str: The converted parameter number with units.
+
+ Examples:
+ >>> params_to_string(1e9)
+ '1000.0 M'
+ >>> params_to_string(2e5)
+ '200.0 k'
+ >>> params_to_string(3e-9)
+ '3e-09'
+ """
+ if units is None:
+ if num_params // 10**6 > 0:
+ return str(round(num_params / 10**6, precision)) + ' M'
+ elif num_params // 10**3:
+ return str(round(num_params / 10**3, precision)) + ' k'
+ else:
+ return str(num_params)
+ else:
+ if units == 'M':
+ return str(round(num_params / 10.**6, precision)) + ' ' + units
+ elif units == 'K':
+ return str(round(num_params / 10.**3, precision)) + ' ' + units
+ else:
+ return str(num_params)
+
+
+def print_model_with_flops(model,
+ total_flops,
+ total_params,
+ units='GFLOPs',
+ precision=3,
+ ost=sys.stdout,
+ flush=False):
+ """Print a model with FLOPs for each layer.
+
+ Args:
+ model (nn.Module): The model to be printed.
+ total_flops (float): Total FLOPs of the model.
+ total_params (float): Total parameter counts of the model.
+ units (str | None): Converted FLOPs units. Default: 'GFLOPs'.
+ precision (int): Digit number after the decimal point. Default: 3.
+ ost (stream): same as `file` param in :func:`print`.
+ Default: sys.stdout.
+ flush (bool): same as that in :func:`print`. Default: False.
+
+ Example:
+ >>> class ExampleModel(nn.Module):
+
+ >>> def __init__(self):
+ >>> super().__init__()
+ >>> self.conv1 = nn.Conv2d(3, 8, 3)
+ >>> self.conv2 = nn.Conv2d(8, 256, 3)
+ >>> self.conv3 = nn.Conv2d(256, 8, 3)
+ >>> self.avg_pool = nn.AdaptiveAvgPool2d((1, 1))
+ >>> self.flatten = nn.Flatten()
+ >>> self.fc = nn.Linear(8, 1)
+
+ >>> def forward(self, x):
+ >>> x = self.conv1(x)
+ >>> x = self.conv2(x)
+ >>> x = self.conv3(x)
+ >>> x = self.avg_pool(x)
+ >>> x = self.flatten(x)
+ >>> x = self.fc(x)
+ >>> return x
+
+ >>> model = ExampleModel()
+ >>> x = (3, 16, 16)
+ To print the complexity information for each layer, you can use
+ >>> get_model_complexity_info(model, x)
+ or directly use
+ >>> print_model_with_flops(model, 4579784.0, 37361)
+ ExampleModel(
+ 0.037 M, 100.000% Params, 0.005 GFLOPs, 100.000% FLOPs,
+ (conv1): Conv2d(0.0 M, 0.600% Params, 0.0 GFLOPs, 0.959% FLOPs, 3, 8, kernel_size=(3, 3), stride=(1, 1)) # noqa: E501
+ (conv2): Conv2d(0.019 M, 50.020% Params, 0.003 GFLOPs, 58.760% FLOPs, 8, 256, kernel_size=(3, 3), stride=(1, 1))
+ (conv3): Conv2d(0.018 M, 49.356% Params, 0.002 GFLOPs, 40.264% FLOPs, 256, 8, kernel_size=(3, 3), stride=(1, 1))
+ (avg_pool): AdaptiveAvgPool2d(0.0 M, 0.000% Params, 0.0 GFLOPs, 0.017% FLOPs, output_size=(1, 1))
+ (flatten): Flatten(0.0 M, 0.000% Params, 0.0 GFLOPs, 0.000% FLOPs, )
+ (fc): Linear(0.0 M, 0.024% Params, 0.0 GFLOPs, 0.000% FLOPs, in_features=8, out_features=1, bias=True)
+ )
+ """
+
+ def accumulate_params(self):
+ if is_supported_instance(self):
+ return self.__params__
+ else:
+ sum = 0
+ for m in self.children():
+ sum += m.accumulate_params()
+ return sum
+
+ def accumulate_flops(self):
+ if is_supported_instance(self):
+ return self.__flops__ / model.__batch_counter__
+ else:
+ sum = 0
+ for m in self.children():
+ sum += m.accumulate_flops()
+ return sum
+
+ def flops_repr(self):
+ accumulated_num_params = self.accumulate_params()
+ accumulated_flops_cost = self.accumulate_flops()
+ return ', '.join([
+ params_to_string(
+ accumulated_num_params, units='M', precision=precision),
+ '{:.3%} Params'.format(accumulated_num_params / total_params),
+ flops_to_string(
+ accumulated_flops_cost, units=units, precision=precision),
+ '{:.3%} FLOPs'.format(accumulated_flops_cost / total_flops),
+ self.original_extra_repr()
+ ])
+
+ def add_extra_repr(m):
+ m.accumulate_flops = accumulate_flops.__get__(m)
+ m.accumulate_params = accumulate_params.__get__(m)
+ flops_extra_repr = flops_repr.__get__(m)
+ if m.extra_repr != flops_extra_repr:
+ m.original_extra_repr = m.extra_repr
+ m.extra_repr = flops_extra_repr
+ assert m.extra_repr != m.original_extra_repr
+
+ def del_extra_repr(m):
+ if hasattr(m, 'original_extra_repr'):
+ m.extra_repr = m.original_extra_repr
+ del m.original_extra_repr
+ if hasattr(m, 'accumulate_flops'):
+ del m.accumulate_flops
+
+ model.apply(add_extra_repr)
+ print(model, file=ost, flush=flush)
+ model.apply(del_extra_repr)
+
+
+def get_model_parameters_number(model):
+ """Calculate parameter number of a model.
+
+ Args:
+ model (nn.module): The model for parameter number calculation.
+
+ Returns:
+ float: Parameter number of the model.
+ """
+ num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
+ return num_params
+
+
+def add_flops_counting_methods(net_main_module):
+ # adding additional methods to the existing module object,
+ # this is done this way so that each function has access to self object
+ net_main_module.start_flops_count = start_flops_count.__get__(
+ net_main_module)
+ net_main_module.stop_flops_count = stop_flops_count.__get__(
+ net_main_module)
+ net_main_module.reset_flops_count = reset_flops_count.__get__(
+ net_main_module)
+ net_main_module.compute_average_flops_cost = compute_average_flops_cost.__get__( # noqa: E501
+ net_main_module)
+
+ net_main_module.reset_flops_count()
+
+ return net_main_module
+
+
+def compute_average_flops_cost(self):
+ """Compute average FLOPs cost.
+
+ A method to compute average FLOPs cost, which will be available after
+ `add_flops_counting_methods()` is called on a desired net object.
+
+ Returns:
+ float: Current mean flops consumption per image.
+ """
+ batches_count = self.__batch_counter__
+ flops_sum = 0
+ for module in self.modules():
+ if is_supported_instance(module):
+ flops_sum += module.__flops__
+ params_sum = get_model_parameters_number(self)
+ return flops_sum / batches_count, params_sum
+
+
+def start_flops_count(self):
+ """Activate the computation of mean flops consumption per image.
+
+ A method to activate the computation of mean flops consumption per image,
+ which will be available after ``add_flops_counting_methods()`` is called on
+ a desired net object. It should be called before running the network.
+ """
+ add_batch_counter_hook_function(self)
+
+ def add_flops_counter_hook_function(module):
+ if is_supported_instance(module):
+ if hasattr(module, '__flops_handle__'):
+ return
+
+ else:
+ handle = module.register_forward_hook(
+ get_modules_mapping()[type(module)])
+
+ module.__flops_handle__ = handle
+
+ self.apply(partial(add_flops_counter_hook_function))
+
+
+def stop_flops_count(self):
+ """Stop computing the mean flops consumption per image.
+
+ A method to stop computing the mean flops consumption per image, which will
+ be available after ``add_flops_counting_methods()`` is called on a desired
+ net object. It can be called to pause the computation at any time.
+ """
+ remove_batch_counter_hook_function(self)
+ self.apply(remove_flops_counter_hook_function)
+
+
+def reset_flops_count(self):
+ """Reset statistics computed so far.
+
+ A method to reset the computed statistics, which will be available after
+ `add_flops_counting_methods()` is called on a desired net object.
+ """
+ add_batch_counter_variables_or_reset(self)
+ self.apply(add_flops_counter_variable_or_reset)
+
+
+# ---- Internal functions
+def empty_flops_counter_hook(module, input, output):
+ module.__flops__ += 0
+
+
+def upsample_flops_counter_hook(module, input, output):
+ output_size = output[0]
+ batch_size = output_size.shape[0]
+ output_elements_count = batch_size
+ for val in output_size.shape[1:]:
+ output_elements_count *= val
+ module.__flops__ += int(output_elements_count)
+
+
+def relu_flops_counter_hook(module, input, output):
+ active_elements_count = output.numel()
+ module.__flops__ += int(active_elements_count)
+
+
+def linear_flops_counter_hook(module, input, output):
+ input = input[0]
+ output_last_dim = output.shape[
+ -1] # pytorch checks dimensions, so here we don't care much
+ module.__flops__ += int(np.prod(input.shape) * output_last_dim)
+
+
+def pool_flops_counter_hook(module, input, output):
+ input = input[0]
+ module.__flops__ += int(np.prod(input.shape))
+
+
+def norm_flops_counter_hook(module, input, output):
+ input = input[0]
+
+ batch_flops = np.prod(input.shape)
+ if (getattr(module, 'affine', False)
+ or getattr(module, 'elementwise_affine', False)):
+ batch_flops *= 2
+ module.__flops__ += int(batch_flops)
+
+
+def deconv_flops_counter_hook(conv_module, input, output):
+ # Can have multiple inputs, getting the first one
+ input = input[0]
+
+ batch_size = input.shape[0]
+ input_height, input_width = input.shape[2:]
+
+ kernel_height, kernel_width = conv_module.kernel_size
+ in_channels = conv_module.in_channels
+ out_channels = conv_module.out_channels
+ groups = conv_module.groups
+
+ filters_per_channel = out_channels // groups
+ conv_per_position_flops = (
+ kernel_height * kernel_width * in_channels * filters_per_channel)
+
+ active_elements_count = batch_size * input_height * input_width
+ overall_conv_flops = conv_per_position_flops * active_elements_count
+ bias_flops = 0
+ if conv_module.bias is not None:
+ output_height, output_width = output.shape[2:]
+ bias_flops = out_channels * batch_size * output_height * output_width
+ overall_flops = overall_conv_flops + bias_flops
+
+ conv_module.__flops__ += int(overall_flops)
+
+
+def conv_flops_counter_hook(conv_module, input, output):
+ # Can have multiple inputs, getting the first one
+ input = input[0]
+
+ batch_size = input.shape[0]
+ output_dims = list(output.shape[2:])
+
+ kernel_dims = list(conv_module.kernel_size)
+ in_channels = conv_module.in_channels
+ out_channels = conv_module.out_channels
+ groups = conv_module.groups
+
+ filters_per_channel = out_channels // groups
+ conv_per_position_flops = int(
+ np.prod(kernel_dims)) * in_channels * filters_per_channel
+
+ active_elements_count = batch_size * int(np.prod(output_dims))
+
+ overall_conv_flops = conv_per_position_flops * active_elements_count
+
+ bias_flops = 0
+
+ if conv_module.bias is not None:
+
+ bias_flops = out_channels * active_elements_count
+
+ overall_flops = overall_conv_flops + bias_flops
+
+ conv_module.__flops__ += int(overall_flops)
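+
+
+# Worked example (editor's note): for ``nn.Conv2d(3, 8, 3)`` on a 1x3x16x16
+# input the output is 1x8x14x14, so
+#   conv_per_position_flops = 3*3 * 3 * 8 = 216
+#   active_elements_count   = 1 * 14*14  = 196
+#   overall_conv_flops      = 216 * 196  = 42336
+#   bias_flops              = 8 * 196    = 1568   (bias enabled by default)
+# and the hook adds 43904 to ``__flops__`` for this layer.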
+
+
+def batch_counter_hook(module, input, output):
+ batch_size = 1
+ if len(input) > 0:
+ # Can have multiple inputs, getting the first one
+ input = input[0]
+ batch_size = len(input)
+ else:
+ warnings.warn('No positional inputs found for a module, '
+ 'assuming batch size is 1.')
+ module.__batch_counter__ += batch_size
+
+
+def add_batch_counter_variables_or_reset(module):
+
+ module.__batch_counter__ = 0
+
+
+def add_batch_counter_hook_function(module):
+ if hasattr(module, '__batch_counter_handle__'):
+ return
+
+ handle = module.register_forward_hook(batch_counter_hook)
+ module.__batch_counter_handle__ = handle
+
+
+def remove_batch_counter_hook_function(module):
+ if hasattr(module, '__batch_counter_handle__'):
+ module.__batch_counter_handle__.remove()
+ del module.__batch_counter_handle__
+
+
+def add_flops_counter_variable_or_reset(module):
+ if is_supported_instance(module):
+ if hasattr(module, '__flops__') or hasattr(module, '__params__'):
+ warnings.warn('variables __flops__ or __params__ are already '
+ 'defined for the module' + type(module).__name__ +
+ ' ptflops can affect your code!')
+ module.__flops__ = 0
+ module.__params__ = get_model_parameters_number(module)
+
+
+def is_supported_instance(module):
+ if type(module) in get_modules_mapping():
+ return True
+ return False
+
+
+def remove_flops_counter_hook_function(module):
+ if is_supported_instance(module):
+ if hasattr(module, '__flops_handle__'):
+ module.__flops_handle__.remove()
+ del module.__flops_handle__
+
+
+def get_modules_mapping():
+ return {
+ # convolutions
+ nn.Conv1d: conv_flops_counter_hook,
+ nn.Conv2d: conv_flops_counter_hook,
+ mmcv.cnn.bricks.Conv2d: conv_flops_counter_hook,
+ nn.Conv3d: conv_flops_counter_hook,
+ mmcv.cnn.bricks.Conv3d: conv_flops_counter_hook,
+ # activations
+ nn.ReLU: relu_flops_counter_hook,
+ nn.PReLU: relu_flops_counter_hook,
+ nn.ELU: relu_flops_counter_hook,
+ nn.LeakyReLU: relu_flops_counter_hook,
+ nn.ReLU6: relu_flops_counter_hook,
+ # poolings
+ nn.MaxPool1d: pool_flops_counter_hook,
+ nn.AvgPool1d: pool_flops_counter_hook,
+ nn.AvgPool2d: pool_flops_counter_hook,
+ nn.MaxPool2d: pool_flops_counter_hook,
+ mmcv.cnn.bricks.MaxPool2d: pool_flops_counter_hook,
+ nn.MaxPool3d: pool_flops_counter_hook,
+ mmcv.cnn.bricks.MaxPool3d: pool_flops_counter_hook,
+ nn.AvgPool3d: pool_flops_counter_hook,
+ nn.AdaptiveMaxPool1d: pool_flops_counter_hook,
+ nn.AdaptiveAvgPool1d: pool_flops_counter_hook,
+ nn.AdaptiveMaxPool2d: pool_flops_counter_hook,
+ nn.AdaptiveAvgPool2d: pool_flops_counter_hook,
+ nn.AdaptiveMaxPool3d: pool_flops_counter_hook,
+ nn.AdaptiveAvgPool3d: pool_flops_counter_hook,
+ # normalizations
+ nn.BatchNorm1d: norm_flops_counter_hook,
+ nn.BatchNorm2d: norm_flops_counter_hook,
+ nn.BatchNorm3d: norm_flops_counter_hook,
+ nn.GroupNorm: norm_flops_counter_hook,
+ nn.InstanceNorm1d: norm_flops_counter_hook,
+ nn.InstanceNorm2d: norm_flops_counter_hook,
+ nn.InstanceNorm3d: norm_flops_counter_hook,
+ nn.LayerNorm: norm_flops_counter_hook,
+ # FC
+ nn.Linear: linear_flops_counter_hook,
+ mmcv.cnn.bricks.Linear: linear_flops_counter_hook,
+ # Upscale
+ nn.Upsample: upsample_flops_counter_hook,
+ # Deconvolution
+ nn.ConvTranspose2d: deconv_flops_counter_hook,
+ mmcv.cnn.bricks.ConvTranspose2d: deconv_flops_counter_hook,
+ }
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/cnn/utils/fuse_conv_bn.py b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/cnn/utils/fuse_conv_bn.py
new file mode 100644
index 0000000000000000000000000000000000000000..fa39d248bf40d093c598535c072e2cb3b2578976
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/cnn/utils/fuse_conv_bn.py
@@ -0,0 +1,72 @@
+# encoding=utf-8
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import torch.nn as nn
+
+
+def _fuse_conv_bn(conv, bn):
+ """Fuse conv and bn into one module.
+
+ Args:
+ conv (nn.Module): Conv to be fused.
+ bn (nn.Module): BN to be fused.
+
+ Returns:
+ nn.Module: Fused module.
+ """
+ conv_w = conv.weight
+ conv_b = conv.bias if conv.bias is not None else torch.zeros_like(
+ bn.running_mean)
+
+ factor = bn.weight / torch.sqrt(bn.running_var + bn.eps)
+ conv.weight = nn.Parameter(conv_w *
+ factor.reshape([conv.out_channels, 1, 1, 1]))
+ conv.bias = nn.Parameter((conv_b - bn.running_mean) * factor + bn.bias)
+ return conv
+
+
+def fuse_conv_bn(module):
+ """Recursively fuse conv and bn in a module.
+
+ During inference, the function of batch norm layers is turned off and
+ only the per-channel mean and var are used, which makes it possible to
+ fuse them into the preceding conv layers to save computation and
+ simplify the network structure.
+
+ Args:
+ module (nn.Module): Module to be fused.
+
+ Returns:
+ nn.Module: Fused module.
+ """
+ last_conv = None
+ last_conv_name = None
+
+ for name, child in module.named_children():
+ if isinstance(child,
+ (nn.modules.batchnorm._BatchNorm, nn.SyncBatchNorm)):
+ if last_conv is None: # only fuse BN that is after Conv
+ continue
+ fused_conv = _fuse_conv_bn(last_conv, child)
+ module._modules[last_conv_name] = fused_conv
+ # To reduce changes, set BN as Identity instead of deleting it.
+ module._modules[name] = nn.Identity()
+ last_conv = None
+ elif isinstance(child, nn.Conv2d):
+ last_conv = child
+ last_conv_name = name
+ else:
+ fuse_conv_bn(child)
+ return module
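+
+
+# Usage sketch (editor's illustration): in eval mode the fused module should
+# match the original output up to floating point error.
+#
+#   net = nn.Sequential(nn.Conv2d(3, 8, 3), nn.BatchNorm2d(8)).eval()
+#   x = torch.randn(1, 3, 16, 16)
+#   ref = net(x)
+#   fused = fuse_conv_bn(net)
+#   assert torch.allclose(ref, fused(x), atol=1e-5)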
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/cnn/utils/sync_bn.py b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/cnn/utils/sync_bn.py
new file mode 100644
index 0000000000000000000000000000000000000000..7cd54b89a146169bb67667b97354c5de8d2e8eaf
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/cnn/utils/sync_bn.py
@@ -0,0 +1,73 @@
+# encoding=utf-8
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+
+import mmcv
+
+
+class _BatchNormXd(torch.nn.modules.batchnorm._BatchNorm):
+ """A general BatchNorm layer without input dimension check.
+
+ Reproduced from @kapily's work:
+ (https://github.com/pytorch/pytorch/issues/41081#issuecomment-783961547)
+ The only difference between BatchNorm1d, BatchNorm2d, BatchNorm3d, etc.
+ is `_check_input_dim`, which is designed for tensor sanity checks.
+ The check has been bypassed in this class for the convenience of converting
+ SyncBatchNorm.
+ """
+
+ def _check_input_dim(self, input):
+ return
+
+
+def revert_sync_batchnorm(module):
+ """Helper function to convert all `SyncBatchNorm` (SyncBN) and
+ `mmcv.ops.sync_bn.SyncBatchNorm` (MMSyncBN) layers in the model to
+ `BatchNormXd` layers.
+
+ Adapted from @kapily's work:
+ (https://github.com/pytorch/pytorch/issues/41081#issuecomment-783961547)
+
+ Args:
+ module (nn.Module): The module containing `SyncBatchNorm` layers.
+
+ Returns:
+ module_output: The converted module with `BatchNormXd` layers.
+ """
+ module_output = module
+ module_checklist = [torch.nn.modules.batchnorm.SyncBatchNorm]
+ if hasattr(mmcv, 'ops'):
+ module_checklist.append(mmcv.ops.SyncBatchNorm)
+ if isinstance(module, tuple(module_checklist)):
+ module_output = _BatchNormXd(module.num_features, module.eps,
+ module.momentum, module.affine,
+ module.track_running_stats)
+ if module.affine:
+ # no_grad() may not be needed here but
+ # just to be consistent with `convert_sync_batchnorm()`
+ with torch.no_grad():
+ module_output.weight = module.weight
+ module_output.bias = module.bias
+ module_output.running_mean = module.running_mean
+ module_output.running_var = module.running_var
+ module_output.num_batches_tracked = module.num_batches_tracked
+ module_output.training = module.training
+ # qconfig exists in quantized models
+ if hasattr(module, 'qconfig'):
+ module_output.qconfig = module.qconfig
+ for name, child in module.named_children():
+ module_output.add_module(name, revert_sync_batchnorm(child))
+ del module
+ return module_output
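+
+
+# Usage sketch (editor's illustration): convert SyncBN layers so the model can
+# run without an initialized process group, e.g. on a single device.
+#
+#   import torch.nn as nn
+#   model = nn.Sequential(nn.Conv2d(3, 8, 3), nn.SyncBatchNorm(8))
+#   model = revert_sync_batchnorm(model)
+#   # the SyncBatchNorm is now a _BatchNormXd carrying the same statistics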
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/cnn/utils/weight_init.py b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/cnn/utils/weight_init.py
new file mode 100644
index 0000000000000000000000000000000000000000..51077db352fc3ed9de5f9373c3236da5b9645544
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/cnn/utils/weight_init.py
@@ -0,0 +1,698 @@
+# encoding=utf-8
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import copy
+import math
+import warnings
+
+import numpy as np
+import torch
+import torch.nn as nn
+from torch import Tensor
+
+from mmcv.utils import Registry, build_from_cfg, get_logger, print_log
+
+INITIALIZERS = Registry('initializer')
+
+
+def update_init_info(module, init_info):
+ """Update the `_params_init_info` in the module if the value of parameters
+ are changed.
+
+ Args:
+ module (obj:`nn.Module`): The module of PyTorch with a user-defined
+ attribute `_params_init_info` which records the initialization
+ information.
+ init_info (str): The string that describes the initialization.
+ """
+ assert hasattr(
+ module,
+ '_params_init_info'), f'Can not find `_params_init_info` in {module}'
+ for name, param in module.named_parameters():
+
+ assert param in module._params_init_info, (
+ f'Find a new :obj:`Parameter` '
+ f'named `{name}` during executing the '
+ f'`init_weights` of '
+ f'`{module.__class__.__name__}`. '
+ f'Please do not add or '
+ f'replace parameters during executing '
+ f'the `init_weights`. ')
+
+ # The parameter has been changed during executing the
+ # `init_weights` of module
+ mean_value = param.data.mean()
+ if module._params_init_info[param]['tmp_mean_value'] != mean_value:
+ module._params_init_info[param]['init_info'] = init_info
+ module._params_init_info[param]['tmp_mean_value'] = mean_value
+
+
+def constant_init(module, val, bias=0):
+ if hasattr(module, 'weight') and module.weight is not None:
+ nn.init.constant_(module.weight, val)
+ if hasattr(module, 'bias') and module.bias is not None:
+ nn.init.constant_(module.bias, bias)
+
+
+def xavier_init(module, gain=1, bias=0, distribution='normal'):
+ assert distribution in ['uniform', 'normal']
+ if hasattr(module, 'weight') and module.weight is not None:
+ if distribution == 'uniform':
+ nn.init.xavier_uniform_(module.weight, gain=gain)
+ else:
+ nn.init.xavier_normal_(module.weight, gain=gain)
+ if hasattr(module, 'bias') and module.bias is not None:
+ nn.init.constant_(module.bias, bias)
+
+
+def normal_init(module, mean=0, std=1, bias=0):
+ if hasattr(module, 'weight') and module.weight is not None:
+ nn.init.normal_(module.weight, mean, std)
+ if hasattr(module, 'bias') and module.bias is not None:
+ nn.init.constant_(module.bias, bias)
+
+
+def trunc_normal_init(module: nn.Module,
+ mean: float = 0,
+ std: float = 1,
+ a: float = -2,
+ b: float = 2,
+ bias: float = 0) -> None:
+ if hasattr(module, 'weight') and module.weight is not None:
+ trunc_normal_(module.weight, mean, std, a, b) # type: ignore
+ if hasattr(module, 'bias') and module.bias is not None:
+ nn.init.constant_(module.bias, bias) # type: ignore
+
+
+def uniform_init(module, a=0, b=1, bias=0):
+ if hasattr(module, 'weight') and module.weight is not None:
+ nn.init.uniform_(module.weight, a, b)
+ if hasattr(module, 'bias') and module.bias is not None:
+ nn.init.constant_(module.bias, bias)
+
+
+def kaiming_init(module,
+ a=0,
+ mode='fan_out',
+ nonlinearity='relu',
+ bias=0,
+ distribution='normal'):
+ assert distribution in ['uniform', 'normal']
+ if hasattr(module, 'weight') and module.weight is not None:
+ if distribution == 'uniform':
+ nn.init.kaiming_uniform_(
+ module.weight, a=a, mode=mode, nonlinearity=nonlinearity)
+ else:
+ nn.init.kaiming_normal_(
+ module.weight, a=a, mode=mode, nonlinearity=nonlinearity)
+ if hasattr(module, 'bias') and module.bias is not None:
+ nn.init.constant_(module.bias, bias)
+
+
+def caffe2_xavier_init(module, bias=0):
+ # `XavierFill` in Caffe2 corresponds to `kaiming_uniform_` in PyTorch
+ # Acknowledgment to FAIR's internal code
+ kaiming_init(
+ module,
+ a=1,
+ mode='fan_in',
+ nonlinearity='leaky_relu',
+ bias=bias,
+ distribution='uniform')
+
+
+def bias_init_with_prob(prior_prob):
+ """initialize conv/fc bias value according to a given probability value."""
+ bias_init = float(-np.log((1 - prior_prob) / prior_prob))
+ return bias_init
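+
+
+# Worked example (editor's note): with prior_prob=0.01 the returned bias is
+# -log((1 - 0.01) / 0.01) = -log(99) ~= -4.595, so a sigmoid over the
+# initialized logits starts out predicting roughly 1% positives.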
+
+
+def _get_bases_name(m):
+ return [b.__name__ for b in m.__class__.__bases__]
+
+
+class BaseInit(object):
+
+ def __init__(self, *, bias=0, bias_prob=None, layer=None):
+ self.wholemodule = False
+ if not isinstance(bias, (int, float)):
+ raise TypeError(f'bias must be a number, but got a {type(bias)}')
+
+ if bias_prob is not None:
+ if not isinstance(bias_prob, float):
+ raise TypeError(f'bias_prob type must be float, \
+ but got {type(bias_prob)}')
+
+ if layer is not None:
+ if not isinstance(layer, (str, list)):
+ raise TypeError(f'layer must be a str or a list of str, \
+ but got a {type(layer)}')
+ else:
+ layer = []
+
+ if bias_prob is not None:
+ self.bias = bias_init_with_prob(bias_prob)
+ else:
+ self.bias = bias
+ self.layer = [layer] if isinstance(layer, str) else layer
+
+ def _get_init_info(self):
+ info = f'{self.__class__.__name__}, bias={self.bias}'
+ return info
+
+
+@INITIALIZERS.register_module(name='Constant')
+class ConstantInit(BaseInit):
+ """Initialize module parameters with constant values.
+
+ Args:
+ val (int | float): the value to fill the weights in the module with
+ bias (int | float): the value to fill the bias. Defaults to 0.
+ bias_prob (float, optional): the probability for bias initialization.
+ Defaults to None.
+ layer (str | list[str], optional): the layer will be initialized.
+ Defaults to None.
+ """
+
+ def __init__(self, val, **kwargs):
+ super().__init__(**kwargs)
+ self.val = val
+
+ def __call__(self, module):
+
+ def init(m):
+ if self.wholemodule:
+ constant_init(m, self.val, self.bias)
+ else:
+ layername = m.__class__.__name__
+ basesname = _get_bases_name(m)
+ if len(set(self.layer) & set([layername] + basesname)):
+ constant_init(m, self.val, self.bias)
+
+ module.apply(init)
+ if hasattr(module, '_params_init_info'):
+ update_init_info(module, init_info=self._get_init_info())
+
+ def _get_init_info(self):
+ info = f'{self.__class__.__name__}: val={self.val}, bias={self.bias}'
+ return info
+
+
+@INITIALIZERS.register_module(name='Xavier')
+class XavierInit(BaseInit):
+ r"""Initialize module parameters with values according to the method
+ described in `Understanding the difficulty of training deep feedforward
+ neural networks - Glorot, X. & Bengio, Y. (2010).
+ <http://proceedings.mlr.press/v9/glorot10a/glorot10a.pdf>`_
+
+ Args:
+ gain (int | float): an optional scaling factor. Defaults to 1.
+ bias (int | float): the value to fill the bias. Defaults to 0.
+ bias_prob (float, optional): the probability for bias initialization.
+ Defaults to None.
+ distribution (str): distribution either be ``'normal'``
+ or ``'uniform'``. Defaults to ``'normal'``.
+ layer (str | list[str], optional): the layer will be initialized.
+ Defaults to None.
+ """
+
+ def __init__(self, gain=1, distribution='normal', **kwargs):
+ super().__init__(**kwargs)
+ self.gain = gain
+ self.distribution = distribution
+
+ def __call__(self, module):
+
+ def init(m):
+ if self.wholemodule:
+ xavier_init(m, self.gain, self.bias, self.distribution)
+ else:
+ layername = m.__class__.__name__
+ basesname = _get_bases_name(m)
+ if len(set(self.layer) & set([layername] + basesname)):
+ xavier_init(m, self.gain, self.bias, self.distribution)
+
+ module.apply(init)
+ if hasattr(module, '_params_init_info'):
+ update_init_info(module, init_info=self._get_init_info())
+
+ def _get_init_info(self):
+ info = f'{self.__class__.__name__}: gain={self.gain}, ' \
+ f'distribution={self.distribution}, bias={self.bias}'
+ return info
+
+
+@INITIALIZERS.register_module(name='Normal')
+class NormalInit(BaseInit):
+ r"""Initialize module parameters with the values drawn from the normal
+ distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)`.
+
+ Args:
+ mean (int | float):the mean of the normal distribution. Defaults to 0.
+ std (int | float): the standard deviation of the normal distribution.
+ Defaults to 1.
+ bias (int | float): the value to fill the bias. Defaults to 0.
+ bias_prob (float, optional): the probability for bias initialization.
+ Defaults to None.
+ layer (str | list[str], optional): the layer will be initialized.
+ Defaults to None.
+
+ """
+
+ def __init__(self, mean=0, std=1, **kwargs):
+ super().__init__(**kwargs)
+ self.mean = mean
+ self.std = std
+
+ def __call__(self, module):
+
+ def init(m):
+ if self.wholemodule:
+ normal_init(m, self.mean, self.std, self.bias)
+ else:
+ layername = m.__class__.__name__
+ basesname = _get_bases_name(m)
+ if len(set(self.layer) & set([layername] + basesname)):
+ normal_init(m, self.mean, self.std, self.bias)
+
+ module.apply(init)
+ if hasattr(module, '_params_init_info'):
+ update_init_info(module, init_info=self._get_init_info())
+
+ def _get_init_info(self):
+ info = f'{self.__class__.__name__}: mean={self.mean},' \
+ f' std={self.std}, bias={self.bias}'
+ return info
+
+
+@INITIALIZERS.register_module(name='TruncNormal')
+class TruncNormalInit(BaseInit):
+ r"""Initialize module parameters with the values drawn from the normal
+ distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)` with values
+ outside :math:`[a, b]`.
+
+ Args:
+ mean (float): the mean of the normal distribution. Defaults to 0.
+ std (float): the standard deviation of the normal distribution.
+ Defaults to 1.
+ a (float): The minimum cutoff value.
+ b (float): The maximum cutoff value.
+ bias (float): the value to fill the bias. Defaults to 0.
+ bias_prob (float, optional): the probability for bias initialization.
+ Defaults to None.
+ layer (str | list[str], optional): the layer will be initialized.
+ Defaults to None.
+
+ """
+
+ def __init__(self,
+ mean: float = 0,
+ std: float = 1,
+ a: float = -2,
+ b: float = 2,
+ **kwargs) -> None:
+ super().__init__(**kwargs)
+ self.mean = mean
+ self.std = std
+ self.a = a
+ self.b = b
+
+ def __call__(self, module: nn.Module) -> None:
+
+ def init(m):
+ if self.wholemodule:
+ trunc_normal_init(m, self.mean, self.std, self.a, self.b,
+ self.bias)
+ else:
+ layername = m.__class__.__name__
+ basesname = _get_bases_name(m)
+ if len(set(self.layer) & set([layername] + basesname)):
+ trunc_normal_init(m, self.mean, self.std, self.a, self.b,
+ self.bias)
+
+ module.apply(init)
+ if hasattr(module, '_params_init_info'):
+ update_init_info(module, init_info=self._get_init_info())
+
+ def _get_init_info(self):
+ info = f'{self.__class__.__name__}: a={self.a}, b={self.b},' \
+ f' mean={self.mean}, std={self.std}, bias={self.bias}'
+ return info
+
+
+@INITIALIZERS.register_module(name='Uniform')
+class UniformInit(BaseInit):
+ r"""Initialize module parameters with values drawn from the uniform
+ distribution :math:`\mathcal{U}(a, b)`.
+
+ Args:
+ a (int | float): the lower bound of the uniform distribution.
+ Defaults to 0.
+ b (int | float): the upper bound of the uniform distribution.
+ Defaults to 1.
+ bias (int | float): the value to fill the bias. Defaults to 0.
+ bias_prob (float, optional): the probability for bias initialization.
+ Defaults to None.
+ layer (str | list[str], optional): the layer will be initialized.
+ Defaults to None.
+ """
+
+ def __init__(self, a=0, b=1, **kwargs):
+ super().__init__(**kwargs)
+ self.a = a
+ self.b = b
+
+ def __call__(self, module):
+
+ def init(m):
+ if self.wholemodule:
+ uniform_init(m, self.a, self.b, self.bias)
+ else:
+ layername = m.__class__.__name__
+ basesname = _get_bases_name(m)
+ if len(set(self.layer) & set([layername] + basesname)):
+ uniform_init(m, self.a, self.b, self.bias)
+
+ module.apply(init)
+ if hasattr(module, '_params_init_info'):
+ update_init_info(module, init_info=self._get_init_info())
+
+ def _get_init_info(self):
+ info = f'{self.__class__.__name__}: a={self.a},' \
+ f' b={self.b}, bias={self.bias}'
+ return info
+
+
+@INITIALIZERS.register_module(name='Kaiming')
+class KaimingInit(BaseInit):
+ r"""Initialize module parameters with the values according to the method
+ described in `Delving deep into rectifiers: Surpassing human-level
+ performance on ImageNet classification - He, K. et al. (2015).
+ <https://www.cv-foundation.org/openaccess/content_iccv_2015/papers/He_Delving_Deep_into_ICCV_2015_paper.pdf>`_
+
+ Args:
+ a (int | float): the negative slope of the rectifier used after this
+ layer (only used with ``'leaky_relu'``). Defaults to 0.
+ mode (str): either ``'fan_in'`` or ``'fan_out'``. Choosing
+ ``'fan_in'`` preserves the magnitude of the variance of the weights
+ in the forward pass. Choosing ``'fan_out'`` preserves the
+ magnitudes in the backwards pass. Defaults to ``'fan_out'``.
+ nonlinearity (str): the non-linear function (`nn.functional` name),
+ recommended to use only with ``'relu'`` or ``'leaky_relu'`` .
+ Defaults to 'relu'.
+ bias (int | float): the value to fill the bias. Defaults to 0.
+ bias_prob (float, optional): the probability for bias initialization.
+ Defaults to None.
+ distribution (str): distribution either be ``'normal'`` or
+ ``'uniform'``. Defaults to ``'normal'``.
+ layer (str | list[str], optional): the layer will be initialized.
+ Defaults to None.
+ """
+
+ def __init__(self,
+ a=0,
+ mode='fan_out',
+ nonlinearity='relu',
+ distribution='normal',
+ **kwargs):
+ super().__init__(**kwargs)
+ self.a = a
+ self.mode = mode
+ self.nonlinearity = nonlinearity
+ self.distribution = distribution
+
+ def __call__(self, module):
+
+ def init(m):
+ if self.wholemodule:
+ kaiming_init(m, self.a, self.mode, self.nonlinearity,
+ self.bias, self.distribution)
+ else:
+ layername = m.__class__.__name__
+ basesname = _get_bases_name(m)
+ if len(set(self.layer) & set([layername] + basesname)):
+ kaiming_init(m, self.a, self.mode, self.nonlinearity,
+ self.bias, self.distribution)
+
+ module.apply(init)
+ if hasattr(module, '_params_init_info'):
+ update_init_info(module, init_info=self._get_init_info())
+
+ def _get_init_info(self):
+ info = f'{self.__class__.__name__}: a={self.a}, mode={self.mode}, ' \
+ f'nonlinearity={self.nonlinearity}, ' \
+ f'distribution ={self.distribution}, bias={self.bias}'
+ return info
+
+
+@INITIALIZERS.register_module(name='Caffe2Xavier')
+class Caffe2XavierInit(KaimingInit):
+ # `XavierFill` in Caffe2 corresponds to `kaiming_uniform_` in PyTorch
+ # Acknowledgment to FAIR's internal code
+ def __init__(self, **kwargs):
+ super().__init__(
+ a=1,
+ mode='fan_in',
+ nonlinearity='leaky_relu',
+ distribution='uniform',
+ **kwargs)
+
+ def __call__(self, module):
+ super().__call__(module)
+
+
+@INITIALIZERS.register_module(name='Pretrained')
+class PretrainedInit(object):
+ """Initialize module by loading a pretrained model.
+
+ Args:
+ checkpoint (str): the checkpoint file of the pretrained model to
+ be loaded.
+ prefix (str, optional): the prefix of a sub-module in the pretrained
+ model. It is for loading a part of the pretrained model to
+ initialize. For example, if we would like to only load the
+ backbone of a detector model, we can set ``prefix='backbone.'``.
+ Defaults to None.
+ map_location (str): map tensors into proper locations.
+ """
+
+ def __init__(self, checkpoint, prefix=None, map_location=None):
+ self.checkpoint = checkpoint
+ self.prefix = prefix
+ self.map_location = map_location
+
+ def __call__(self, module):
+ from mmcv.runner import (_load_checkpoint_with_prefix, load_checkpoint,
+ load_state_dict)
+ logger = get_logger('mmcv')
+ if self.prefix is None:
+ print_log(f'load model from: {self.checkpoint}', logger=logger)
+ load_checkpoint(
+ module,
+ self.checkpoint,
+ map_location=self.map_location,
+ strict=False,
+ logger=logger)
+ else:
+ print_log(
+ f'load {self.prefix} in model from: {self.checkpoint}',
+ logger=logger)
+ state_dict = _load_checkpoint_with_prefix(
+ self.prefix, self.checkpoint, map_location=self.map_location)
+ load_state_dict(module, state_dict, strict=False, logger=logger)
+
+ if hasattr(module, '_params_init_info'):
+ update_init_info(module, init_info=self._get_init_info())
+
+ def _get_init_info(self):
+ info = f'{self.__class__.__name__}: load from {self.checkpoint}'
+ return info
+
+
+def _initialize(module, cfg, wholemodule=False):
+ func = build_from_cfg(cfg, INITIALIZERS)
+ # wholemodule flag is for override mode, there is no layer key in override
+ # and initializer will give init values for the whole module with the name
+ # in override.
+ func.wholemodule = wholemodule
+ func(module)
+
+
+def _initialize_override(module, override, cfg):
+ if not isinstance(override, (dict, list)):
+ raise TypeError(f'override must be a dict or a list of dict, \
+ but got {type(override)}')
+
+ override = [override] if isinstance(override, dict) else override
+
+ for override_ in override:
+
+ cp_override = copy.deepcopy(override_)
+ name = cp_override.pop('name', None)
+ if name is None:
+ raise ValueError('`override` must contain the key "name",'
+ f'but got {cp_override}')
+ # if override only has name key, it means use args in init_cfg
+ if not cp_override:
+ cp_override.update(cfg)
+ # if override has name key and other args except type key, it will
+ # raise error
+ elif 'type' not in cp_override.keys():
+ raise ValueError(
+ f'`override` need "type" key, but got {cp_override}')
+
+ if hasattr(module, name):
+ _initialize(getattr(module, name), cp_override, wholemodule=True)
+ else:
+ raise RuntimeError(f'module did not have attribute {name}, '
+ f'but init_cfg is {cp_override}.')
+
+
+def initialize(module, init_cfg):
+ r"""Initialize a module.
+
+ Args:
+ module (``torch.nn.Module``): the module will be initialized.
+ init_cfg (dict | list[dict]): initialization configuration dict to
+ define initializer. OpenMMLab has implemented 6 initializers
+ including ``Constant``, ``Xavier``, ``Normal``, ``Uniform``,
+ ``Kaiming``, and ``Pretrained``.
+
+ Example:
+ >>> module = nn.Linear(2, 3, bias=True)
+ >>> init_cfg = dict(type='Constant', layer='Linear', val=1, bias=2)
+ >>> initialize(module, init_cfg)
+
+ >>> module = nn.Sequential(nn.Conv1d(3, 1, 3), nn.Linear(1,2))
+ >>> # define key ``'layer'`` for initializing layer with different
+ >>> # configuration
+ >>> init_cfg = [dict(type='Constant', layer='Conv1d', val=1),
+ dict(type='Constant', layer='Linear', val=2)]
+ >>> initialize(module, init_cfg)
+
+ >>> # define key``'override'`` to initialize some specific part in
+ >>> # module
+ >>> class FooNet(nn.Module):
+ >>> def __init__(self):
+ >>> super().__init__()
+ >>> self.feat = nn.Conv2d(3, 16, 3)
+ >>> self.reg = nn.Conv2d(16, 10, 3)
+ >>> self.cls = nn.Conv2d(16, 5, 3)
+ >>> model = FooNet()
+ >>> init_cfg = dict(type='Constant', val=1, bias=2, layer='Conv2d',
+ >>> override=dict(type='Constant', name='reg', val=3, bias=4))
+ >>> initialize(model, init_cfg)
+
+ >>> model = ResNet(depth=50)
+ >>> # Initialize weights with the pretrained model.
+ >>> init_cfg = dict(type='Pretrained',
+ checkpoint='torchvision://resnet50')
+ >>> initialize(model, init_cfg)
+
+ >>> # Initialize weights of a sub-module with the specific part of
+ >>> # a pretrained model by using "prefix".
+ >>> url = 'http://download.openmmlab.com/mmdetection/v2.0/retinanet/'\
+ >>> 'retinanet_r50_fpn_1x_coco/'\
+ >>> 'retinanet_r50_fpn_1x_coco_20200130-c2398f9e.pth'
+ >>> init_cfg = dict(type='Pretrained',
+ checkpoint=url, prefix='backbone.')
+ """
+ if not isinstance(init_cfg, (dict, list)):
+ raise TypeError(f'init_cfg must be a dict or a list of dict, \
+ but got {type(init_cfg)}')
+
+ if isinstance(init_cfg, dict):
+ init_cfg = [init_cfg]
+
+ for cfg in init_cfg:
+ # should deeply copy the original config because cfg may be used by
+ # other modules, e.g., one init_cfg shared by multiple bottleneck
+ # blocks, the expected cfg will be changed after pop and will change
+ # the initialization behavior of other modules
+ cp_cfg = copy.deepcopy(cfg)
+ override = cp_cfg.pop('override', None)
+ _initialize(module, cp_cfg)
+
+ if override is not None:
+ cp_cfg.pop('layer', None)
+ _initialize_override(module, override, cp_cfg)
+ else:
+ # All attributes in module have same initialization.
+ pass
+
+
+def _no_grad_trunc_normal_(tensor: Tensor, mean: float, std: float, a: float,
+ b: float) -> Tensor:
+ # Method based on
+ # https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf
+ # Modified from
+ # https://github.com/pytorch/pytorch/blob/master/torch/nn/init.py
+ def norm_cdf(x):
+ # Computes standard normal cumulative distribution function
+ return (1. + math.erf(x / math.sqrt(2.))) / 2.
+
+ if (mean < a - 2 * std) or (mean > b + 2 * std):
+ warnings.warn(
+ 'mean is more than 2 std from [a, b] in nn.init.trunc_normal_. '
+ 'The distribution of values may be incorrect.',
+ stacklevel=2)
+
+ with torch.no_grad():
+ # Values are generated by using a truncated uniform distribution and
+ # then using the inverse CDF for the normal distribution.
+ # Get upper and lower cdf values
+ lower = norm_cdf((a - mean) / std)
+ upper = norm_cdf((b - mean) / std)
+
+ # Uniformly fill tensor with values from [lower, upper], then translate
+ # to [2lower-1, 2upper-1].
+ tensor.uniform_(2 * lower - 1, 2 * upper - 1)
+
+ # Use inverse cdf transform for normal distribution to get truncated
+ # standard normal
+ tensor.erfinv_()
+
+ # Transform to proper mean, std
+ tensor.mul_(std * math.sqrt(2.))
+ tensor.add_(mean)
+
+ # Clamp to ensure it's in the proper range
+ tensor.clamp_(min=a, max=b)
+ return tensor
+
+
+def trunc_normal_(tensor: Tensor,
+ mean: float = 0.,
+ std: float = 1.,
+ a: float = -2.,
+ b: float = 2.) -> Tensor:
+ r"""Fills the input Tensor with values drawn from a truncated
+ normal distribution. The values are effectively drawn from the
+ normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)`
+ with values outside :math:`[a, b]` redrawn until they are within
+ the bounds. The method used for generating the random values works
+ best when :math:`a \leq \text{mean} \leq b`.
+
+ Modified from
+ https://github.com/pytorch/pytorch/blob/master/torch/nn/init.py
+
+ Args:
+ tensor (``torch.Tensor``): an n-dimensional `torch.Tensor`.
+ mean (float): the mean of the normal distribution.
+ std (float): the standard deviation of the normal distribution.
+ a (float): the minimum cutoff value.
+ b (float): the maximum cutoff value.
+ """
+ return _no_grad_trunc_normal_(tensor, mean, std, a, b)
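+
+
+# Usage sketch (editor's illustration): the ViT-style weight initialization
+# this helper is typically used for.
+#
+#   w = torch.empty(768, 768)
+#   trunc_normal_(w, std=.02)   # N(0, 0.02^2) samples, clamped to [-2, 2]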
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/cnn/vgg.py b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/cnn/vgg.py
new file mode 100644
index 0000000000000000000000000000000000000000..55ee30d247e4a062eac0506ec67a11c41f16c38c
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/cnn/vgg.py
@@ -0,0 +1,188 @@
+# encoding=utf-8
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import logging
+
+import torch.nn as nn
+
+from .utils import constant_init, kaiming_init, normal_init
+
+
+def conv3x3(in_planes, out_planes, dilation=1):
+ """3x3 convolution with padding."""
+ return nn.Conv2d(
+ in_planes,
+ out_planes,
+ kernel_size=3,
+ padding=dilation,
+ dilation=dilation)
+
+
+def make_vgg_layer(inplanes,
+ planes,
+ num_blocks,
+ dilation=1,
+ with_bn=False,
+ ceil_mode=False):
+ layers = []
+ for _ in range(num_blocks):
+ layers.append(conv3x3(inplanes, planes, dilation))
+ if with_bn:
+ layers.append(nn.BatchNorm2d(planes))
+ layers.append(nn.ReLU(inplace=True))
+ inplanes = planes
+ layers.append(nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=ceil_mode))
+
+ return layers
+
+
+class VGG(nn.Module):
+ """VGG backbone.
+
+ Args:
+ depth (int): Depth of vgg, from {11, 13, 16, 19}.
+ with_bn (bool): Use BatchNorm or not.
+ num_classes (int): number of classes for classification.
+ num_stages (int): VGG stages, normally 5.
+ dilations (Sequence[int]): Dilation of each stage.
+ out_indices (Sequence[int]): Output from which stages.
+ frozen_stages (int): Stages to be frozen (all param fixed). -1 means
+ not freezing any parameters.
+ bn_eval (bool): Whether to set BN layers as eval mode, namely, freeze
+ running stats (mean and var).
+ bn_frozen (bool): Whether to freeze weight and bias of BN layers.
+ """
+
+ arch_settings = {
+ 11: (1, 1, 2, 2, 2),
+ 13: (2, 2, 2, 2, 2),
+ 16: (2, 2, 3, 3, 3),
+ 19: (2, 2, 4, 4, 4)
+ }
+
+ def __init__(self,
+ depth,
+ with_bn=False,
+ num_classes=-1,
+ num_stages=5,
+ dilations=(1, 1, 1, 1, 1),
+ out_indices=(0, 1, 2, 3, 4),
+ frozen_stages=-1,
+ bn_eval=True,
+ bn_frozen=False,
+ ceil_mode=False,
+ with_last_pool=True):
+ super(VGG, self).__init__()
+ if depth not in self.arch_settings:
+ raise KeyError(f'invalid depth {depth} for vgg')
+ assert num_stages >= 1 and num_stages <= 5
+ stage_blocks = self.arch_settings[depth]
+ self.stage_blocks = stage_blocks[:num_stages]
+ assert len(dilations) == num_stages
+ assert max(out_indices) <= num_stages
+
+ self.num_classes = num_classes
+ self.out_indices = out_indices
+ self.frozen_stages = frozen_stages
+ self.bn_eval = bn_eval
+ self.bn_frozen = bn_frozen
+
+ self.inplanes = 3
+ start_idx = 0
+ vgg_layers = []
+ self.range_sub_modules = []
+ for i, num_blocks in enumerate(self.stage_blocks):
+ num_modules = num_blocks * (2 + with_bn) + 1
+ end_idx = start_idx + num_modules
+ dilation = dilations[i]
+ planes = 64 * 2**i if i < 4 else 512
+ vgg_layer = make_vgg_layer(
+ self.inplanes,
+ planes,
+ num_blocks,
+ dilation=dilation,
+ with_bn=with_bn,
+ ceil_mode=ceil_mode)
+ vgg_layers.extend(vgg_layer)
+ self.inplanes = planes
+ self.range_sub_modules.append([start_idx, end_idx])
+ start_idx = end_idx
+ if not with_last_pool:
+ vgg_layers.pop(-1)
+ self.range_sub_modules[-1][1] -= 1
+ self.module_name = 'features'
+ self.add_module(self.module_name, nn.Sequential(*vgg_layers))
+
+ if self.num_classes > 0:
+ self.classifier = nn.Sequential(
+ nn.Linear(512 * 7 * 7, 4096),
+ nn.ReLU(True),
+ nn.Dropout(),
+ nn.Linear(4096, 4096),
+ nn.ReLU(True),
+ nn.Dropout(),
+ nn.Linear(4096, num_classes),
+ )
+
+ def init_weights(self, pretrained=None):
+ if isinstance(pretrained, str):
+ logger = logging.getLogger()
+ from ..runner import load_checkpoint
+ load_checkpoint(self, pretrained, strict=False, logger=logger)
+ elif pretrained is None:
+ for m in self.modules():
+ if isinstance(m, nn.Conv2d):
+ kaiming_init(m)
+ elif isinstance(m, nn.BatchNorm2d):
+ constant_init(m, 1)
+ elif isinstance(m, nn.Linear):
+ normal_init(m, std=0.01)
+ else:
+ raise TypeError('pretrained must be a str or None')
+
+ def forward(self, x):
+ outs = []
+ vgg_layers = getattr(self, self.module_name)
+ for i in range(len(self.stage_blocks)):
+ for j in range(*self.range_sub_modules[i]):
+ vgg_layer = vgg_layers[j]
+ x = vgg_layer(x)
+ if i in self.out_indices:
+ outs.append(x)
+ if self.num_classes > 0:
+ x = x.view(x.size(0), -1)
+ x = self.classifier(x)
+ outs.append(x)
+ if len(outs) == 1:
+ return outs[0]
+ else:
+ return tuple(outs)
+
+ def train(self, mode=True):
+ super(VGG, self).train(mode)
+ if self.bn_eval:
+ for m in self.modules():
+ if isinstance(m, nn.BatchNorm2d):
+ m.eval()
+ if self.bn_frozen:
+ for params in m.parameters():
+ params.requires_grad = False
+ vgg_layers = getattr(self, self.module_name)
+ if mode and self.frozen_stages >= 0:
+ for i in range(self.frozen_stages):
+ for j in range(*self.range_sub_modules[i]):
+ mod = vgg_layers[j]
+ mod.eval()
+ for param in mod.parameters():
+ param.requires_grad = False
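A rough usage sketch for the VGG backbone defined above (illustrative only, not part of the patch; it assumes the ``VGG`` class is in scope):

    import torch

    # VGG-16 without a classification head (num_classes defaults to -1),
    # returning the feature map of every stage
    model = VGG(depth=16, out_indices=(0, 1, 2, 3, 4))
    model.init_weights()            # random init since pretrained is None
    model.eval()

    x = torch.randn(1, 3, 224, 224)
    with torch.no_grad():
        feats = model(x)            # tuple of 5 stage outputs
    print([f.shape for f in feats])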
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/engine/__init__.py b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/engine/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..0730e959a9298d679a3097736431e51094bcf408
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/engine/__init__.py
@@ -0,0 +1,21 @@
+# encoding=utf-8
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .test import (collect_results_cpu, collect_results_gpu, multi_gpu_test,
+ single_gpu_test)
+
+__all__ = [
+ 'collect_results_cpu', 'collect_results_gpu', 'multi_gpu_test',
+ 'single_gpu_test'
+]
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/engine/test.py b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/engine/test.py
new file mode 100644
index 0000000000000000000000000000000000000000..2eaf9fb4715011ce7687e6c4aa71dc099a1a3d0e
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/engine/test.py
@@ -0,0 +1,215 @@
+# encoding=utf-8
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os.path as osp
+import pickle
+import shutil
+import tempfile
+import time
+
+import torch
+import torch.distributed as dist
+
+import mmcv
+from mmcv.runner import get_dist_info
+
+
+def single_gpu_test(model, data_loader):
+ """Test model with a single gpu.
+
+    This method tests the model with a single gpu and displays a test
+    progress bar.
+
+ Args:
+ model (nn.Module): Model to be tested.
+ data_loader (nn.Dataloader): Pytorch data loader.
+
+ Returns:
+ list: The prediction results.
+ """
+ model.eval()
+ results = []
+ dataset = data_loader.dataset
+ prog_bar = mmcv.ProgressBar(len(dataset))
+ for data in data_loader:
+ with torch.no_grad():
+ result = model(return_loss=False, **data)
+ results.extend(result)
+
+        # Assume result has the same length as batch_size
+ # refer to https://github.com/open-mmlab/mmcv/issues/985
+ batch_size = len(result)
+ for _ in range(batch_size):
+ prog_bar.update()
+ return results
+
+
+def multi_gpu_test(model, data_loader, tmpdir=None, gpu_collect=False):
+ """Test model with multiple gpus.
+
+    This method tests the model with multiple gpus and collects the results
+    under two different modes: gpu and cpu. By setting ``gpu_collect=True``,
+    it encodes results to gpu tensors and uses gpu communication for result
+    collection. In cpu mode it saves the results from different gpus to
+    ``tmpdir`` and collects them with the rank 0 worker.
+
+ Args:
+ model (nn.Module): Model to be tested.
+ data_loader (nn.Dataloader): Pytorch data loader.
+ tmpdir (str): Path of directory to save the temporary results from
+ different gpus under cpu mode.
+ gpu_collect (bool): Option to use either gpu or cpu to collect results.
+
+ Returns:
+ list: The prediction results.
+ """
+ model.eval()
+ results = []
+ dataset = data_loader.dataset
+ rank, world_size = get_dist_info()
+ if rank == 0:
+ prog_bar = mmcv.ProgressBar(len(dataset))
+    time.sleep(2)  # This line can prevent a deadlock problem in some cases.
+ for i, data in enumerate(data_loader):
+ with torch.no_grad():
+ result = model(return_loss=False, **data)
+ results.extend(result)
+
+ if rank == 0:
+ batch_size = len(result)
+ batch_size_all = batch_size * world_size
+ if batch_size_all + prog_bar.completed > len(dataset):
+ batch_size_all = len(dataset) - prog_bar.completed
+ for _ in range(batch_size_all):
+ prog_bar.update()
+
+ # collect results from all ranks
+ if gpu_collect:
+ results = collect_results_gpu(results, len(dataset))
+ else:
+ results = collect_results_cpu(results, len(dataset), tmpdir)
+ return results
+
+
+def collect_results_cpu(result_part, size, tmpdir=None):
+ """Collect results under cpu mode.
+
+    In cpu mode, this function will save the results from different gpus to
+ ``tmpdir`` and collect them by the rank 0 worker.
+
+ Args:
+ result_part (list): Result list containing result parts
+ to be collected.
+ size (int): Size of the results, commonly equal to length of
+ the results.
+ tmpdir (str | None): temporal directory for collected results to
+ store. If set to None, it will create a random temporal directory
+ for it.
+
+ Returns:
+ list: The collected results.
+ """
+ rank, world_size = get_dist_info()
+ # create a tmp dir if it is not specified
+ if tmpdir is None:
+ MAX_LEN = 512
+ # 32 is whitespace
+ dir_tensor = torch.full((MAX_LEN, ),
+ 32,
+ dtype=torch.uint8,
+ device='cuda')
+ if rank == 0:
+ mmcv.mkdir_or_exist('.dist_test')
+ tmpdir = tempfile.mkdtemp(dir='.dist_test')
+ tmpdir = torch.tensor(
+ bytearray(tmpdir.encode()), dtype=torch.uint8, device='cuda')
+ dir_tensor[:len(tmpdir)] = tmpdir
+ dist.broadcast(dir_tensor, 0)
+ tmpdir = dir_tensor.cpu().numpy().tobytes().decode().rstrip()
+ else:
+ mmcv.mkdir_or_exist(tmpdir)
+ # dump the part result to the dir
+ mmcv.dump(result_part, osp.join(tmpdir, f'part_{rank}.pkl'))
+ dist.barrier()
+ # collect all parts
+ if rank != 0:
+ return None
+ else:
+ # load results of all parts from tmp dir
+ part_list = []
+ for i in range(world_size):
+ part_file = osp.join(tmpdir, f'part_{i}.pkl')
+ part_result = mmcv.load(part_file)
+ # When data is severely insufficient, an empty part_result
+            # on a certain gpu could make the overall outputs empty.
+ if part_result:
+ part_list.append(part_result)
+ # sort the results
+ ordered_results = []
+ for res in zip(*part_list):
+ ordered_results.extend(list(res))
+ # the dataloader may pad some samples
+ ordered_results = ordered_results[:size]
+ # remove tmp dir
+ shutil.rmtree(tmpdir)
+ return ordered_results
+
+
+def collect_results_gpu(result_part, size):
+ """Collect results under gpu mode.
+
+    In gpu mode, this function will encode results to gpu tensors and use gpu
+ communication for results collection.
+
+ Args:
+ result_part (list): Result list containing result parts
+ to be collected.
+ size (int): Size of the results, commonly equal to length of
+ the results.
+
+ Returns:
+ list: The collected results.
+ """
+ rank, world_size = get_dist_info()
+ # dump result part to tensor with pickle
+ part_tensor = torch.tensor(
+ bytearray(pickle.dumps(result_part)), dtype=torch.uint8, device='cuda')
+ # gather all result part tensor shape
+ shape_tensor = torch.tensor(part_tensor.shape, device='cuda')
+ shape_list = [shape_tensor.clone() for _ in range(world_size)]
+ dist.all_gather(shape_list, shape_tensor)
+ # padding result part tensor to max length
+ shape_max = torch.tensor(shape_list).max()
+ part_send = torch.zeros(shape_max, dtype=torch.uint8, device='cuda')
+ part_send[:shape_tensor[0]] = part_tensor
+ part_recv_list = [
+ part_tensor.new_zeros(shape_max) for _ in range(world_size)
+ ]
+ # gather all result part
+ dist.all_gather(part_recv_list, part_send)
+
+ if rank == 0:
+ part_list = []
+ for recv, shape in zip(part_recv_list, shape_list):
+ part_result = pickle.loads(recv[:shape[0]].cpu().numpy().tobytes())
+ # When data is severely insufficient, an empty part_result
+            # on a certain gpu could make the overall outputs empty.
+ if part_result:
+ part_list.append(part_result)
+ # sort the results
+ ordered_results = []
+ for res in zip(*part_list):
+ ordered_results.extend(list(res))
+ # the dataloader may pad some samples
+ ordered_results = ordered_results[:size]
+ return ordered_results
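A sketch of how the helpers above are typically combined in an evaluation script (illustrative only, not part of the patch; ``model``, ``data_loader`` and the distributed launcher setup are assumed to exist elsewhere):

    from mmcv.runner import get_dist_info

    rank, world_size = get_dist_info()
    if world_size > 1:
        # gpu_collect=True gathers results via gpu tensors (collect_results_gpu);
        # gpu_collect=False writes pickle parts to tmpdir (collect_results_cpu)
        results = multi_gpu_test(model, data_loader, tmpdir='.eval_tmp',
                                 gpu_collect=False)
    else:
        results = single_gpu_test(model, data_loader)

    if rank == 0:
        print(f'collected {len(results)} predictions')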
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/fileio/__init__.py b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/fileio/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..21a683db00d98219e9df878cb3511aa53181c5d1
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/fileio/__init__.py
@@ -0,0 +1,24 @@
+# encoding=utf-8
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .file_client import BaseStorageBackend, FileClient
+from .handlers import BaseFileHandler, JsonHandler, PickleHandler, YamlHandler
+from .io import dump, load, register_handler
+from .parse import dict_from_file, list_from_file
+
+__all__ = [
+ 'BaseStorageBackend', 'FileClient', 'load', 'dump', 'register_handler',
+ 'BaseFileHandler', 'JsonHandler', 'PickleHandler', 'YamlHandler',
+ 'list_from_file', 'dict_from_file'
+]
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/fileio/file_client.py b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/fileio/file_client.py
new file mode 100644
index 0000000000000000000000000000000000000000..fd7036be73144b1163241a00a58d5d297ca980ad
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/fileio/file_client.py
@@ -0,0 +1,1162 @@
+# encoding=utf-8
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import inspect
+import os
+import os.path as osp
+import re
+import tempfile
+import warnings
+from abc import ABCMeta, abstractmethod
+from contextlib import contextmanager
+from pathlib import Path
+from typing import Iterable, Iterator, Optional, Tuple, Union
+from urllib.request import urlopen
+
+import mmcv
+from mmcv.utils.misc import has_method
+from mmcv.utils.path import is_filepath
+
+
+class BaseStorageBackend(metaclass=ABCMeta):
+ """Abstract class of storage backends.
+
+    All backends need to implement two APIs: ``get()`` and ``get_text()``.
+    ``get()`` reads the file as a byte stream and ``get_text()`` reads the
+    file as text.
+ """
+
+ # a flag to indicate whether the backend can create a symlink for a file
+ _allow_symlink = False
+
+ @property
+ def name(self):
+ return self.__class__.__name__
+
+ @property
+ def allow_symlink(self):
+ return self._allow_symlink
+
+ @abstractmethod
+ def get(self, filepath):
+ pass
+
+ @abstractmethod
+ def get_text(self, filepath):
+ pass
+
+
+class CephBackend(BaseStorageBackend):
+ """Ceph storage backend (for internal use).
+
+ Args:
+ path_mapping (dict|None): path mapping dict from local path to Petrel
+ path. When ``path_mapping={'src': 'dst'}``, ``src`` in ``filepath``
+ will be replaced by ``dst``. Default: None.
+
+ .. warning::
+ :class:`mmcv.fileio.file_client.CephBackend` will be deprecated,
+ please use :class:`mmcv.fileio.file_client.PetrelBackend` instead.
+ """
+
+ def __init__(self, path_mapping=None):
+ try:
+ import ceph
+ except ImportError:
+ raise ImportError('Please install ceph to enable CephBackend.')
+
+ warnings.warn(
+ 'CephBackend will be deprecated, please use PetrelBackend instead',
+ DeprecationWarning)
+ self._client = ceph.S3Client()
+ assert isinstance(path_mapping, dict) or path_mapping is None
+ self.path_mapping = path_mapping
+
+ def get(self, filepath):
+ filepath = str(filepath)
+ if self.path_mapping is not None:
+ for k, v in self.path_mapping.items():
+ filepath = filepath.replace(k, v)
+ value = self._client.Get(filepath)
+ value_buf = memoryview(value)
+ return value_buf
+
+ def get_text(self, filepath, encoding=None):
+ raise NotImplementedError
+
+
+class PetrelBackend(BaseStorageBackend):
+ """Petrel storage backend (for internal use).
+
+ PetrelBackend supports reading and writing data to multiple clusters.
+ If the file path contains the cluster name, PetrelBackend will read data
+ from specified cluster or write data to it. Otherwise, PetrelBackend will
+ access the default cluster.
+
+ Args:
+ path_mapping (dict, optional): Path mapping dict from local path to
+ Petrel path. When ``path_mapping={'src': 'dst'}``, ``src`` in
+ ``filepath`` will be replaced by ``dst``. Default: None.
+ enable_mc (bool, optional): Whether to enable memcached support.
+ Default: True.
+
+ Examples:
+ >>> filepath1 = 's3://path/of/file'
+ >>> filepath2 = 'cluster-name:s3://path/of/file'
+ >>> client = PetrelBackend()
+ >>> client.get(filepath1) # get data from default cluster
+ >>> client.get(filepath2) # get data from 'cluster-name' cluster
+ """
+
+ def __init__(self,
+ path_mapping: Optional[dict] = None,
+ enable_mc: bool = True):
+ try:
+ from petrel_client import client
+ except ImportError:
+ raise ImportError('Please install petrel_client to enable '
+ 'PetrelBackend.')
+
+ self._client = client.Client(enable_mc=enable_mc)
+ assert isinstance(path_mapping, dict) or path_mapping is None
+ self.path_mapping = path_mapping
+
+ def _map_path(self, filepath: Union[str, Path]) -> str:
+ """Map ``filepath`` to a string path whose prefix will be replaced by
+ :attr:`self.path_mapping`.
+
+ Args:
+ filepath (str): Path to be mapped.
+ """
+ filepath = str(filepath)
+ if self.path_mapping is not None:
+ for k, v in self.path_mapping.items():
+ filepath = filepath.replace(k, v)
+ return filepath
+
+ def _format_path(self, filepath: str) -> str:
+ """Convert a ``filepath`` to standard format of petrel oss.
+
+ If the ``filepath`` is concatenated by ``os.path.join``, in a Windows
+ environment, the ``filepath`` will be the format of
+ 's3://bucket_name\\image.jpg'. By invoking :meth:`_format_path`, the
+ above ``filepath`` will be converted to 's3://bucket_name/image.jpg'.
+
+ Args:
+ filepath (str): Path to be formatted.
+ """
+ return re.sub(r'\\+', '/', filepath)
+
+ def get(self, filepath: Union[str, Path]) -> memoryview:
+ """Read data from a given ``filepath`` with 'rb' mode.
+
+ Args:
+ filepath (str or Path): Path to read data.
+
+ Returns:
+ memoryview: A memory view of expected bytes object to avoid
+ copying. The memoryview object can be converted to bytes by
+ ``value_buf.tobytes()``.
+ """
+ filepath = self._map_path(filepath)
+ filepath = self._format_path(filepath)
+ value = self._client.Get(filepath)
+ value_buf = memoryview(value)
+ return value_buf
+
+ def get_text(self,
+ filepath: Union[str, Path],
+ encoding: str = 'utf-8') -> str:
+ """Read data from a given ``filepath`` with 'r' mode.
+
+ Args:
+ filepath (str or Path): Path to read data.
+ encoding (str): The encoding format used to open the ``filepath``.
+ Default: 'utf-8'.
+
+ Returns:
+ str: Expected text reading from ``filepath``.
+ """
+ return str(self.get(filepath), encoding=encoding)
+
+ def put(self, obj: bytes, filepath: Union[str, Path]) -> None:
+ """Save data to a given ``filepath``.
+
+ Args:
+ obj (bytes): Data to be saved.
+ filepath (str or Path): Path to write data.
+ """
+ filepath = self._map_path(filepath)
+ filepath = self._format_path(filepath)
+ self._client.put(filepath, obj)
+
+ def put_text(self,
+ obj: str,
+ filepath: Union[str, Path],
+ encoding: str = 'utf-8') -> None:
+ """Save data to a given ``filepath``.
+
+ Args:
+ obj (str): Data to be written.
+ filepath (str or Path): Path to write data.
+ encoding (str): The encoding format used to encode the ``obj``.
+ Default: 'utf-8'.
+ """
+ self.put(bytes(obj, encoding=encoding), filepath)
+
+ def remove(self, filepath: Union[str, Path]) -> None:
+ """Remove a file.
+
+ Args:
+ filepath (str or Path): Path to be removed.
+ """
+ if not has_method(self._client, 'delete'):
+ raise NotImplementedError(
+ ('Current version of Petrel Python SDK has not supported '
+ 'the `delete` method, please use a higher version or dev'
+ ' branch instead.'))
+
+ filepath = self._map_path(filepath)
+ filepath = self._format_path(filepath)
+ self._client.delete(filepath)
+
+ def exists(self, filepath: Union[str, Path]) -> bool:
+ """Check whether a file path exists.
+
+ Args:
+ filepath (str or Path): Path to be checked whether exists.
+
+ Returns:
+ bool: Return ``True`` if ``filepath`` exists, ``False`` otherwise.
+ """
+ if not (has_method(self._client, 'contains')
+ and has_method(self._client, 'isdir')):
+ raise NotImplementedError(
+ ('Current version of Petrel Python SDK has not supported '
+ 'the `contains` and `isdir` methods, please use a higher'
+ 'version or dev branch instead.'))
+
+ filepath = self._map_path(filepath)
+ filepath = self._format_path(filepath)
+ return self._client.contains(filepath) or self._client.isdir(filepath)
+
+ def isdir(self, filepath: Union[str, Path]) -> bool:
+ """Check whether a file path is a directory.
+
+ Args:
+ filepath (str or Path): Path to be checked whether it is a
+ directory.
+
+ Returns:
+ bool: Return ``True`` if ``filepath`` points to a directory,
+ ``False`` otherwise.
+ """
+ if not has_method(self._client, 'isdir'):
+ raise NotImplementedError(
+ ('Current version of Petrel Python SDK has not supported '
+ 'the `isdir` method, please use a higher version or dev'
+ ' branch instead.'))
+
+ filepath = self._map_path(filepath)
+ filepath = self._format_path(filepath)
+ return self._client.isdir(filepath)
+
+ def isfile(self, filepath: Union[str, Path]) -> bool:
+ """Check whether a file path is a file.
+
+ Args:
+ filepath (str or Path): Path to be checked whether it is a file.
+
+ Returns:
+ bool: Return ``True`` if ``filepath`` points to a file, ``False``
+ otherwise.
+ """
+ if not has_method(self._client, 'contains'):
+ raise NotImplementedError(
+ ('Current version of Petrel Python SDK has not supported '
+ 'the `contains` method, please use a higher version or '
+ 'dev branch instead.'))
+
+ filepath = self._map_path(filepath)
+ filepath = self._format_path(filepath)
+ return self._client.contains(filepath)
+
+ def join_path(self, filepath: Union[str, Path],
+ *filepaths: Union[str, Path]) -> str:
+ """Concatenate all file paths.
+
+ Args:
+ filepath (str or Path): Path to be concatenated.
+
+ Returns:
+ str: The result after concatenation.
+ """
+ filepath = self._format_path(self._map_path(filepath))
+ if filepath.endswith('/'):
+ filepath = filepath[:-1]
+ formatted_paths = [filepath]
+ for path in filepaths:
+ formatted_paths.append(self._format_path(self._map_path(path)))
+ return '/'.join(formatted_paths)
+
+ @contextmanager
+ def get_local_path(self, filepath: Union[str, Path]) -> Iterable[str]:
+ """Download a file from ``filepath`` and return a temporary path.
+
+        ``get_local_path`` is decorated by :meth:`contextlib.contextmanager`.
+        It can be called with a ``with`` statement, and when exiting from the
+        ``with`` statement, the temporary path will be released.
+
+ Args:
+ filepath (str | Path): Download a file from ``filepath``.
+
+ Examples:
+ >>> client = PetrelBackend()
+            >>> # After exiting from the ``with`` clause,
+ >>> # the path will be removed
+ >>> with client.get_local_path('s3://path/of/your/file') as path:
+ ... # do something here
+
+ Yields:
+ Iterable[str]: Only yield one temporary path.
+ """
+ filepath = self._map_path(filepath)
+ filepath = self._format_path(filepath)
+ assert self.isfile(filepath)
+ try:
+ f = tempfile.NamedTemporaryFile(delete=False)
+ f.write(self.get(filepath))
+ f.close()
+ yield f.name
+ finally:
+ os.remove(f.name)
+
+ def list_dir_or_file(self,
+ dir_path: Union[str, Path],
+ list_dir: bool = True,
+ list_file: bool = True,
+ suffix: Optional[Union[str, Tuple[str]]] = None,
+ recursive: bool = False) -> Iterator[str]:
+ """Scan a directory to find the interested directories or files in
+ arbitrary order.
+
+ Note:
+ Petrel has no concept of directories but it simulates the directory
+ hierarchy in the filesystem through public prefixes. In addition,
+ if the returned path ends with '/', it means the path is a public
+ prefix which is a logical directory.
+
+ Note:
+ :meth:`list_dir_or_file` returns the path relative to ``dir_path``.
+            In addition, the returned directory path will not contain the
+            trailing '/', which is consistent with other backends.
+
+ Args:
+ dir_path (str | Path): Path of the directory.
+ list_dir (bool): List the directories. Default: True.
+ list_file (bool): List the path of files. Default: True.
+ suffix (str or tuple[str], optional): File suffix
+ that we are interested in. Default: None.
+ recursive (bool): If set to True, recursively scan the
+ directory. Default: False.
+
+ Yields:
+ Iterable[str]: A relative path to ``dir_path``.
+ """
+ if not has_method(self._client, 'list'):
+ raise NotImplementedError(
+ ('Current version of Petrel Python SDK has not supported '
+ 'the `list` method, please use a higher version or dev'
+ ' branch instead.'))
+
+ dir_path = self._map_path(dir_path)
+ dir_path = self._format_path(dir_path)
+ if list_dir and suffix is not None:
+ raise TypeError(
+ '`list_dir` should be False when `suffix` is not None')
+
+ if (suffix is not None) and not isinstance(suffix, (str, tuple)):
+ raise TypeError('`suffix` must be a string or tuple of strings')
+
+ # Petrel's simulated directory hierarchy assumes that directory paths
+ # should end with `/`
+ if not dir_path.endswith('/'):
+ dir_path += '/'
+
+ root = dir_path
+
+ def _list_dir_or_file(dir_path, list_dir, list_file, suffix,
+ recursive):
+ for path in self._client.list(dir_path):
+ # the `self.isdir` is not used here to determine whether path
+ # is a directory, because `self.isdir` relies on
+ # `self._client.list`
+ if path.endswith('/'): # a directory path
+ next_dir_path = self.join_path(dir_path, path)
+ if list_dir:
+ # get the relative path and exclude the last
+ # character '/'
+ rel_dir = next_dir_path[len(root):-1]
+ yield rel_dir
+ if recursive:
+ yield from _list_dir_or_file(next_dir_path, list_dir,
+ list_file, suffix,
+ recursive)
+ else: # a file path
+ absolute_path = self.join_path(dir_path, path)
+ rel_path = absolute_path[len(root):]
+ if (suffix is None
+ or rel_path.endswith(suffix)) and list_file:
+ yield rel_path
+
+ return _list_dir_or_file(dir_path, list_dir, list_file, suffix,
+ recursive)
+
+
+class MemcachedBackend(BaseStorageBackend):
+ """Memcached storage backend.
+
+ Attributes:
+ server_list_cfg (str): Config file for memcached server list.
+ client_cfg (str): Config file for memcached client.
+ sys_path (str | None): Additional path to be appended to `sys.path`.
+ Default: None.
+ """
+
+ def __init__(self, server_list_cfg, client_cfg, sys_path=None):
+ if sys_path is not None:
+ import sys
+ sys.path.append(sys_path)
+ try:
+ import mc
+ except ImportError:
+ raise ImportError(
+ 'Please install memcached to enable MemcachedBackend.')
+
+ self.server_list_cfg = server_list_cfg
+ self.client_cfg = client_cfg
+ self._client = mc.MemcachedClient.GetInstance(self.server_list_cfg,
+ self.client_cfg)
+        # mc.pyvector serves as a pointer to a memory cache
+ self._mc_buffer = mc.pyvector()
+
+ def get(self, filepath):
+ filepath = str(filepath)
+ import mc
+ self._client.Get(filepath, self._mc_buffer)
+ value_buf = mc.ConvertBuffer(self._mc_buffer)
+ return value_buf
+
+ def get_text(self, filepath, encoding=None):
+ raise NotImplementedError
+
+
+class LmdbBackend(BaseStorageBackend):
+ """Lmdb storage backend.
+
+ Args:
+ db_path (str): Lmdb database path.
+ readonly (bool, optional): Lmdb environment parameter. If True,
+ disallow any write operations. Default: True.
+ lock (bool, optional): Lmdb environment parameter. If False, when
+ concurrent access occurs, do not lock the database. Default: False.
+ readahead (bool, optional): Lmdb environment parameter. If False,
+ disable the OS filesystem readahead mechanism, which may improve
+ random read performance when a database is larger than RAM.
+ Default: False.
+
+ Attributes:
+ db_path (str): Lmdb database path.
+ """
+
+ def __init__(self,
+ db_path,
+ readonly=True,
+ lock=False,
+ readahead=False,
+ **kwargs):
+ try:
+ import lmdb
+ except ImportError:
+ raise ImportError('Please install lmdb to enable LmdbBackend.')
+
+ self.db_path = str(db_path)
+ self._client = lmdb.open(
+ self.db_path,
+ readonly=readonly,
+ lock=lock,
+ readahead=readahead,
+ **kwargs)
+
+ def get(self, filepath):
+ """Get values according to the filepath.
+
+ Args:
+ filepath (str | obj:`Path`): Here, filepath is the lmdb key.
+ """
+ filepath = str(filepath)
+ with self._client.begin(write=False) as txn:
+ value_buf = txn.get(filepath.encode('ascii'))
+ return value_buf
+
+ def get_text(self, filepath, encoding=None):
+ raise NotImplementedError
+
+
+class HardDiskBackend(BaseStorageBackend):
+ """Raw hard disks storage backend."""
+
+ _allow_symlink = True
+
+ def get(self, filepath: Union[str, Path]) -> bytes:
+ """Read data from a given ``filepath`` with 'rb' mode.
+
+ Args:
+ filepath (str or Path): Path to read data.
+
+ Returns:
+ bytes: Expected bytes object.
+ """
+ with open(filepath, 'rb') as f:
+ value_buf = f.read()
+ return value_buf
+
+ def get_text(self,
+ filepath: Union[str, Path],
+ encoding: str = 'utf-8') -> str:
+ """Read data from a given ``filepath`` with 'r' mode.
+
+ Args:
+ filepath (str or Path): Path to read data.
+ encoding (str): The encoding format used to open the ``filepath``.
+ Default: 'utf-8'.
+
+ Returns:
+ str: Expected text reading from ``filepath``.
+ """
+ with open(filepath, 'r', encoding=encoding) as f:
+ value_buf = f.read()
+ return value_buf
+
+ def put(self, obj: bytes, filepath: Union[str, Path]) -> None:
+ """Write data to a given ``filepath`` with 'wb' mode.
+
+ Note:
+ ``put`` will create a directory if the directory of ``filepath``
+ does not exist.
+
+ Args:
+ obj (bytes): Data to be written.
+ filepath (str or Path): Path to write data.
+ """
+ mmcv.mkdir_or_exist(osp.dirname(filepath))
+ with open(filepath, 'wb') as f:
+ f.write(obj)
+
+ def put_text(self,
+ obj: str,
+ filepath: Union[str, Path],
+ encoding: str = 'utf-8') -> None:
+ """Write data to a given ``filepath`` with 'w' mode.
+
+ Note:
+ ``put_text`` will create a directory if the directory of
+ ``filepath`` does not exist.
+
+ Args:
+ obj (str): Data to be written.
+ filepath (str or Path): Path to write data.
+ encoding (str): The encoding format used to open the ``filepath``.
+ Default: 'utf-8'.
+ """
+ mmcv.mkdir_or_exist(osp.dirname(filepath))
+ with open(filepath, 'w', encoding=encoding) as f:
+ f.write(obj)
+
+ def remove(self, filepath: Union[str, Path]) -> None:
+ """Remove a file.
+
+ Args:
+ filepath (str or Path): Path to be removed.
+ """
+ os.remove(filepath)
+
+ def exists(self, filepath: Union[str, Path]) -> bool:
+ """Check whether a file path exists.
+
+ Args:
+ filepath (str or Path): Path to be checked whether exists.
+
+ Returns:
+ bool: Return ``True`` if ``filepath`` exists, ``False`` otherwise.
+ """
+ return osp.exists(filepath)
+
+ def isdir(self, filepath: Union[str, Path]) -> bool:
+ """Check whether a file path is a directory.
+
+ Args:
+ filepath (str or Path): Path to be checked whether it is a
+ directory.
+
+ Returns:
+ bool: Return ``True`` if ``filepath`` points to a directory,
+ ``False`` otherwise.
+ """
+ return osp.isdir(filepath)
+
+ def isfile(self, filepath: Union[str, Path]) -> bool:
+ """Check whether a file path is a file.
+
+ Args:
+ filepath (str or Path): Path to be checked whether it is a file.
+
+ Returns:
+ bool: Return ``True`` if ``filepath`` points to a file, ``False``
+ otherwise.
+ """
+ return osp.isfile(filepath)
+
+ def join_path(self, filepath: Union[str, Path],
+ *filepaths: Union[str, Path]) -> str:
+ """Concatenate all file paths.
+
+ Join one or more filepath components intelligently. The return value
+ is the concatenation of filepath and any members of *filepaths.
+
+ Args:
+ filepath (str or Path): Path to be concatenated.
+
+ Returns:
+ str: The result of concatenation.
+ """
+ return osp.join(filepath, *filepaths)
+
+ @contextmanager
+ def get_local_path(
+ self, filepath: Union[str, Path]) -> Iterable[Union[str, Path]]:
+ """Only for unified API and do nothing."""
+ yield filepath
+
+ def list_dir_or_file(self,
+ dir_path: Union[str, Path],
+ list_dir: bool = True,
+ list_file: bool = True,
+ suffix: Optional[Union[str, Tuple[str]]] = None,
+ recursive: bool = False) -> Iterator[str]:
+ """Scan a directory to find the interested directories or files in
+ arbitrary order.
+
+ Note:
+ :meth:`list_dir_or_file` returns the path relative to ``dir_path``.
+
+ Args:
+ dir_path (str | Path): Path of the directory.
+ list_dir (bool): List the directories. Default: True.
+ list_file (bool): List the path of files. Default: True.
+ suffix (str or tuple[str], optional): File suffix
+ that we are interested in. Default: None.
+ recursive (bool): If set to True, recursively scan the
+ directory. Default: False.
+
+ Yields:
+ Iterable[str]: A relative path to ``dir_path``.
+ """
+ if list_dir and suffix is not None:
+ raise TypeError('`suffix` should be None when `list_dir` is True')
+
+ if (suffix is not None) and not isinstance(suffix, (str, tuple)):
+ raise TypeError('`suffix` must be a string or tuple of strings')
+
+ root = dir_path
+
+ def _list_dir_or_file(dir_path, list_dir, list_file, suffix,
+ recursive):
+ for entry in os.scandir(dir_path):
+ if not entry.name.startswith('.') and entry.is_file():
+ rel_path = osp.relpath(entry.path, root)
+ if (suffix is None
+ or rel_path.endswith(suffix)) and list_file:
+ yield rel_path
+ elif osp.isdir(entry.path):
+ if list_dir:
+ rel_dir = osp.relpath(entry.path, root)
+ yield rel_dir
+ if recursive:
+ yield from _list_dir_or_file(entry.path, list_dir,
+ list_file, suffix,
+ recursive)
+
+ return _list_dir_or_file(dir_path, list_dir, list_file, suffix,
+ recursive)
+
+
+class HTTPBackend(BaseStorageBackend):
+ """HTTP and HTTPS storage bachend."""
+
+ def get(self, filepath):
+ value_buf = urlopen(filepath).read()
+ return value_buf
+
+ def get_text(self, filepath, encoding='utf-8'):
+ value_buf = urlopen(filepath).read()
+ return value_buf.decode(encoding)
+
+ @contextmanager
+ def get_local_path(self, filepath: str) -> Iterable[str]:
+ """Download a file from ``filepath``.
+
+        ``get_local_path`` is decorated by :meth:`contextlib.contextmanager`.
+        It can be called with a ``with`` statement, and when exiting from the
+        ``with`` statement, the temporary path will be released.
+
+ Args:
+ filepath (str): Download a file from ``filepath``.
+
+ Examples:
+ >>> client = HTTPBackend()
+            >>> # After exiting from the ``with`` clause,
+ >>> # the path will be removed
+ >>> with client.get_local_path('http://path/of/your/file') as path:
+ ... # do something here
+ """
+ try:
+ f = tempfile.NamedTemporaryFile(delete=False)
+ f.write(self.get(filepath))
+ f.close()
+ yield f.name
+ finally:
+ os.remove(f.name)
+
+
+class FileClient:
+ """A general file client to access files in different backends.
+
+    The client loads a file or text from a specified backend given its path
+    and returns the content as binary data or text. There are two ways to
+    choose a backend: by backend name or by path prefix. If both are set,
+    ``backend`` takes priority and determines the storage backend; if both
+    are ``None``, the disk backend is chosen. Note that other backend
+    accessors can also be registered with a given name, prefixes, and
+    backend class. In addition, the singleton pattern is used to avoid
+    repeated object creation: if the arguments are the same, the same object
+    will be returned.
+
+ Args:
+ backend (str, optional): The storage backend type. Options are "disk",
+ "ceph", "memcached", "lmdb", "http" and "petrel". Default: None.
+ prefix (str, optional): The prefix of the registered storage backend.
+ Options are "s3", "http", "https". Default: None.
+
+ Examples:
+ >>> # only set backend
+ >>> file_client = FileClient(backend='petrel')
+ >>> # only set prefix
+ >>> file_client = FileClient(prefix='s3')
+ >>> # set both backend and prefix but use backend to choose client
+ >>> file_client = FileClient(backend='petrel', prefix='s3')
+ >>> # if the arguments are the same, the same object is returned
+ >>> file_client1 = FileClient(backend='petrel')
+ >>> file_client1 is file_client
+ True
+
+ Attributes:
+ client (:obj:`BaseStorageBackend`): The backend object.
+ """
+
+ _backends = {
+ 'disk': HardDiskBackend,
+ 'ceph': CephBackend,
+ 'memcached': MemcachedBackend,
+ 'lmdb': LmdbBackend,
+ 'petrel': PetrelBackend,
+ 'http': HTTPBackend,
+ }
+    # This collection records the overridden backends. When a backend appears
+    # in the collection, the singleton pattern is disabled for that backend,
+    # because otherwise the returned object would be the backend from before
+    # the override.
+ _overridden_backends = set()
+ _prefix_to_backends = {
+ 's3': PetrelBackend,
+ 'http': HTTPBackend,
+ 'https': HTTPBackend,
+ }
+ _overridden_prefixes = set()
+
+ _instances = {}
+
+ def __new__(cls, backend=None, prefix=None, **kwargs):
+ if backend is None and prefix is None:
+ backend = 'disk'
+ if backend is not None and backend not in cls._backends:
+ raise ValueError(
+ f'Backend {backend} is not supported. Currently supported ones'
+ f' are {list(cls._backends.keys())}')
+ if prefix is not None and prefix not in cls._prefix_to_backends:
+ raise ValueError(
+ f'prefix {prefix} is not supported. Currently supported ones '
+ f'are {list(cls._prefix_to_backends.keys())}')
+
+ # concatenate the arguments to a unique key for determining whether
+ # objects with the same arguments were created
+ arg_key = f'{backend}:{prefix}'
+ for key, value in kwargs.items():
+ arg_key += f':{key}:{value}'
+
+ # if a backend was overridden, it will create a new object
+ if (arg_key in cls._instances
+ and backend not in cls._overridden_backends
+ and prefix not in cls._overridden_prefixes):
+ _instance = cls._instances[arg_key]
+ else:
+ # create a new object and put it to _instance
+ _instance = super().__new__(cls)
+ if backend is not None:
+ _instance.client = cls._backends[backend](**kwargs)
+ else:
+ _instance.client = cls._prefix_to_backends[prefix](**kwargs)
+
+ cls._instances[arg_key] = _instance
+
+ return _instance
+
+ @property
+ def name(self):
+ return self.client.name
+
+ @property
+ def allow_symlink(self):
+ return self.client.allow_symlink
+
+ @staticmethod
+ def parse_uri_prefix(uri: Union[str, Path]) -> Optional[str]:
+ """Parse the prefix of a uri.
+
+ Args:
+ uri (str | Path): Uri to be parsed that contains the file prefix.
+
+ Examples:
+ >>> FileClient.parse_uri_prefix('s3://path/of/your/file')
+ 's3'
+
+ Returns:
+ str | None: Return the prefix of uri if the uri contains '://' else
+ ``None``.
+ """
+ assert is_filepath(uri)
+ uri = str(uri)
+ if '://' not in uri:
+ return None
+ else:
+ prefix, _ = uri.split('://')
+            # In the case of PetrelBackend, the prefix may contain the cluster
+            # name, e.g. clusterName:s3
+ if ':' in prefix:
+ _, prefix = prefix.split(':')
+ return prefix
+
+ @classmethod
+ def infer_client(cls,
+ file_client_args: Optional[dict] = None,
+ uri: Optional[Union[str, Path]] = None) -> 'FileClient':
+ """Infer a suitable file client based on the URI and arguments.
+
+ Args:
+ file_client_args (dict, optional): Arguments to instantiate a
+ FileClient. Default: None.
+ uri (str | Path, optional): Uri to be parsed that contains the file
+ prefix. Default: None.
+
+ Examples:
+ >>> uri = 's3://path/of/your/file'
+ >>> file_client = FileClient.infer_client(uri=uri)
+ >>> file_client_args = {'backend': 'petrel'}
+ >>> file_client = FileClient.infer_client(file_client_args)
+
+ Returns:
+ FileClient: Instantiated FileClient object.
+ """
+ assert file_client_args is not None or uri is not None
+ if file_client_args is None:
+ file_prefix = cls.parse_uri_prefix(uri) # type: ignore
+ return cls(prefix=file_prefix)
+ else:
+ return cls(**file_client_args)
+
+ @classmethod
+ def _register_backend(cls, name, backend, force=False, prefixes=None):
+ if not isinstance(name, str):
+ raise TypeError('the backend name should be a string, '
+ f'but got {type(name)}')
+ if not inspect.isclass(backend):
+ raise TypeError(
+ f'backend should be a class but got {type(backend)}')
+ if not issubclass(backend, BaseStorageBackend):
+ raise TypeError(
+ f'backend {backend} is not a subclass of BaseStorageBackend')
+ if not force and name in cls._backends:
+ raise KeyError(
+ f'{name} is already registered as a storage backend, '
+ 'add "force=True" if you want to override it')
+
+ if name in cls._backends and force:
+ cls._overridden_backends.add(name)
+ cls._backends[name] = backend
+
+ if prefixes is not None:
+ if isinstance(prefixes, str):
+ prefixes = [prefixes]
+ else:
+ assert isinstance(prefixes, (list, tuple))
+ for prefix in prefixes:
+ if prefix not in cls._prefix_to_backends:
+ cls._prefix_to_backends[prefix] = backend
+ elif (prefix in cls._prefix_to_backends) and force:
+ cls._overridden_prefixes.add(prefix)
+ cls._prefix_to_backends[prefix] = backend
+ else:
+ raise KeyError(
+ f'{prefix} is already registered as a storage backend,'
+ ' add "force=True" if you want to override it')
+
+ @classmethod
+ def register_backend(cls, name, backend=None, force=False, prefixes=None):
+ """Register a backend to FileClient.
+
+ This method can be used as a normal class method or a decorator.
+
+ .. code-block:: python
+
+ class NewBackend(BaseStorageBackend):
+
+ def get(self, filepath):
+ return filepath
+
+ def get_text(self, filepath):
+ return filepath
+
+ FileClient.register_backend('new', NewBackend)
+
+ or
+
+ .. code-block:: python
+
+ @FileClient.register_backend('new')
+ class NewBackend(BaseStorageBackend):
+
+ def get(self, filepath):
+ return filepath
+
+ def get_text(self, filepath):
+ return filepath
+
+ Args:
+ name (str): The name of the registered backend.
+ backend (class, optional): The backend class to be registered,
+ which must be a subclass of :class:`BaseStorageBackend`.
+ When this method is used as a decorator, backend is None.
+ Defaults to None.
+ force (bool, optional): Whether to override the backend if the name
+ has already been registered. Defaults to False.
+ prefixes (str or list[str] or tuple[str], optional): The prefixes
+ of the registered storage backend. Default: None.
+ `New in version 1.3.15.`
+ """
+ if backend is not None:
+ cls._register_backend(
+ name, backend, force=force, prefixes=prefixes)
+ return
+
+ def _register(backend_cls):
+ cls._register_backend(
+ name, backend_cls, force=force, prefixes=prefixes)
+ return backend_cls
+
+ return _register
+
+ def get(self, filepath: Union[str, Path]) -> Union[bytes, memoryview]:
+ """Read data from a given ``filepath`` with 'rb' mode.
+
+ Note:
+ There are two types of return values for ``get``, one is ``bytes``
+ and the other is ``memoryview``. The advantage of using memoryview
+ is that you can avoid copying, and if you want to convert it to
+ ``bytes``, you can use ``.tobytes()``.
+
+ Args:
+ filepath (str or Path): Path to read data.
+
+ Returns:
+ bytes | memoryview: Expected bytes object or a memory view of the
+ bytes object.
+ """
+ return self.client.get(filepath)
+
+ def get_text(self, filepath: Union[str, Path], encoding='utf-8') -> str:
+ """Read data from a given ``filepath`` with 'r' mode.
+
+ Args:
+ filepath (str or Path): Path to read data.
+ encoding (str): The encoding format used to open the ``filepath``.
+ Default: 'utf-8'.
+
+ Returns:
+ str: Expected text reading from ``filepath``.
+ """
+ return self.client.get_text(filepath, encoding)
+
+ def put(self, obj: bytes, filepath: Union[str, Path]) -> None:
+ """Write data to a given ``filepath`` with 'wb' mode.
+
+ Note:
+ ``put`` should create a directory if the directory of ``filepath``
+ does not exist.
+
+ Args:
+ obj (bytes): Data to be written.
+ filepath (str or Path): Path to write data.
+ """
+ self.client.put(obj, filepath)
+
+ def put_text(self, obj: str, filepath: Union[str, Path]) -> None:
+ """Write data to a given ``filepath`` with 'w' mode.
+
+ Note:
+ ``put_text`` should create a directory if the directory of
+ ``filepath`` does not exist.
+
+ Args:
+ obj (str): Data to be written.
+ filepath (str or Path): Path to write data.
+ """
+ self.client.put_text(obj, filepath)
+
+ def remove(self, filepath: Union[str, Path]) -> None:
+ """Remove a file.
+
+ Args:
+ filepath (str, Path): Path to be removed.
+ """
+ self.client.remove(filepath)
+
+ def exists(self, filepath: Union[str, Path]) -> bool:
+ """Check whether a file path exists.
+
+ Args:
+ filepath (str or Path): Path to be checked whether exists.
+
+ Returns:
+ bool: Return ``True`` if ``filepath`` exists, ``False`` otherwise.
+ """
+ return self.client.exists(filepath)
+
+ def isdir(self, filepath: Union[str, Path]) -> bool:
+ """Check whether a file path is a directory.
+
+ Args:
+ filepath (str or Path): Path to be checked whether it is a
+ directory.
+
+ Returns:
+ bool: Return ``True`` if ``filepath`` points to a directory,
+ ``False`` otherwise.
+ """
+ return self.client.isdir(filepath)
+
+ def isfile(self, filepath: Union[str, Path]) -> bool:
+ """Check whether a file path is a file.
+
+ Args:
+ filepath (str or Path): Path to be checked whether it is a file.
+
+ Returns:
+ bool: Return ``True`` if ``filepath`` points to a file, ``False``
+ otherwise.
+ """
+ return self.client.isfile(filepath)
+
+ def join_path(self, filepath: Union[str, Path],
+ *filepaths: Union[str, Path]) -> str:
+ """Concatenate all file paths.
+
+ Join one or more filepath components intelligently. The return value
+ is the concatenation of filepath and any members of *filepaths.
+
+ Args:
+ filepath (str or Path): Path to be concatenated.
+
+ Returns:
+ str: The result of concatenation.
+ """
+ return self.client.join_path(filepath, *filepaths)
+
+ @contextmanager
+ def get_local_path(self, filepath: Union[str, Path]) -> Iterable[str]:
+ """Download data from ``filepath`` and write the data to local path.
+
+        ``get_local_path`` is decorated by :meth:`contextlib.contextmanager`.
+        It can be called with a ``with`` statement, and when exiting from the
+        ``with`` statement, the temporary path will be released.
+
+ Note:
+ If the ``filepath`` is a local path, just return itself.
+
+ .. warning::
+ ``get_local_path`` is an experimental interface that may change in
+ the future.
+
+ Args:
+ filepath (str or Path): Path to be read data.
+
+ Examples:
+ >>> file_client = FileClient(prefix='s3')
+ >>> with file_client.get_local_path('s3://bucket/abc.jpg') as path:
+ ... # do something here
+
+ Yields:
+ Iterable[str]: Only yield one path.
+ """
+ with self.client.get_local_path(str(filepath)) as local_path:
+ yield local_path
+
+ def list_dir_or_file(self,
+ dir_path: Union[str, Path],
+ list_dir: bool = True,
+ list_file: bool = True,
+ suffix: Optional[Union[str, Tuple[str]]] = None,
+ recursive: bool = False) -> Iterator[str]:
+ """Scan a directory to find the interested directories or files in
+ arbitrary order.
+
+ Note:
+ :meth:`list_dir_or_file` returns the path relative to ``dir_path``.
+
+ Args:
+ dir_path (str | Path): Path of the directory.
+ list_dir (bool): List the directories. Default: True.
+ list_file (bool): List the path of files. Default: True.
+ suffix (str or tuple[str], optional): File suffix
+ that we are interested in. Default: None.
+ recursive (bool): If set to True, recursively scan the
+ directory. Default: False.
+
+ Yields:
+ Iterable[str]: A relative path to ``dir_path``.
+ """
+ yield from self.client.list_dir_or_file(dir_path, list_dir, list_file,
+ suffix, recursive)
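A small sketch of the FileClient API above (illustrative only, not part of the patch; the ``dummy`` backend name and prefix are made up for the example):

    @FileClient.register_backend('dummy', prefixes='dummy')
    class DummyBackend(BaseStorageBackend):

        def get(self, filepath):
            return b'payload for ' + str(filepath).encode()

        def get_text(self, filepath, encoding='utf-8'):
            return f'payload for {filepath}'

    client = FileClient(backend='dummy')
    print(client.get_text('dummy://some/file.txt'))

    # with no backend/prefix the disk backend is chosen, and identical
    # arguments return the same singleton instance
    assert FileClient() is FileClient()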
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/fileio/handlers/__init__.py b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/fileio/handlers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b54d3630512558b2a8951a652b33d6a42127ae56
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/fileio/handlers/__init__.py
@@ -0,0 +1,20 @@
+# encoding=utf-8
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .base import BaseFileHandler
+from .json_handler import JsonHandler
+from .pickle_handler import PickleHandler
+from .yaml_handler import YamlHandler
+
+__all__ = ['BaseFileHandler', 'JsonHandler', 'PickleHandler', 'YamlHandler']
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/fileio/handlers/base.py b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/fileio/handlers/base.py
new file mode 100644
index 0000000000000000000000000000000000000000..73c8a536a12da51eb66194d370ff0cb0156b8d50
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/fileio/handlers/base.py
@@ -0,0 +1,43 @@
+# encoding=utf-8
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from abc import ABCMeta, abstractmethod
+
+
+class BaseFileHandler(metaclass=ABCMeta):
+    # `str_like` is a flag indicating whether the file object type is
+    # str-like or bytes-like. Pickle only processes bytes-like objects while
+    # json only processes str-like objects. If it is str-like, `StringIO`
+    # will be used to process the buffer.
+ str_like = True
+
+ @abstractmethod
+ def load_from_fileobj(self, file, **kwargs):
+ pass
+
+ @abstractmethod
+ def dump_to_fileobj(self, obj, file, **kwargs):
+ pass
+
+ @abstractmethod
+ def dump_to_str(self, obj, **kwargs):
+ pass
+
+ def load_from_path(self, filepath, mode='r', **kwargs):
+ with open(filepath, mode) as f:
+ return self.load_from_fileobj(f, **kwargs)
+
+ def dump_to_path(self, obj, filepath, mode='w', **kwargs):
+ with open(filepath, mode) as f:
+ self.dump_to_fileobj(obj, f, **kwargs)
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/fileio/handlers/json_handler.py b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/fileio/handlers/json_handler.py
new file mode 100644
index 0000000000000000000000000000000000000000..713ba0db5633f9e0f4099527aa479dd939b2deb0
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/fileio/handlers/json_handler.py
@@ -0,0 +1,49 @@
+# encoding=utf-8
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import json
+
+import numpy as np
+
+from .base import BaseFileHandler
+
+
+def set_default(obj):
+ """Set default json values for non-serializable values.
+
+ It helps convert ``set``, ``range`` and ``np.ndarray`` data types to list.
+ It also converts ``np.generic`` (including ``np.int32``, ``np.float32``,
+    etc.) into plain numbers of Python built-in types.
+ """
+ if isinstance(obj, (set, range)):
+ return list(obj)
+ elif isinstance(obj, np.ndarray):
+ return obj.tolist()
+ elif isinstance(obj, np.generic):
+ return obj.item()
+ raise TypeError(f'{type(obj)} is unsupported for json dump')
+
+
+class JsonHandler(BaseFileHandler):
+
+ def load_from_fileobj(self, file):
+ return json.load(file)
+
+ def dump_to_fileobj(self, obj, file, **kwargs):
+ kwargs.setdefault('default', set_default)
+ json.dump(obj, file, **kwargs)
+
+ def dump_to_str(self, obj, **kwargs):
+ kwargs.setdefault('default', set_default)
+ return json.dumps(obj, **kwargs)
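A quick sketch of what ``set_default`` enables (illustrative only, not part of the patch): values the standard JSON encoder rejects are converted to built-in types before dumping.

    import numpy as np

    handler = JsonHandler()
    data = {'score': np.float32(0.5), 'steps': range(3), 'ids': {1, 2}}
    print(handler.dump_to_str(data))
    # e.g. {"score": 0.5, "steps": [0, 1, 2], "ids": [1, 2]}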
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/fileio/handlers/pickle_handler.py b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/fileio/handlers/pickle_handler.py
new file mode 100644
index 0000000000000000000000000000000000000000..509885247ed3f3203dd9caf0544e18fd6c96664d
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/fileio/handlers/pickle_handler.py
@@ -0,0 +1,41 @@
+# encoding=utf-8
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import pickle
+
+from .base import BaseFileHandler
+
+
+class PickleHandler(BaseFileHandler):
+
+ str_like = False
+
+ def load_from_fileobj(self, file, **kwargs):
+ return pickle.load(file, **kwargs)
+
+ def load_from_path(self, filepath, **kwargs):
+ return super(PickleHandler, self).load_from_path(
+ filepath, mode='rb', **kwargs)
+
+ def dump_to_str(self, obj, **kwargs):
+ kwargs.setdefault('protocol', 2)
+ return pickle.dumps(obj, **kwargs)
+
+ def dump_to_fileobj(self, obj, file, **kwargs):
+ kwargs.setdefault('protocol', 2)
+ pickle.dump(obj, file, **kwargs)
+
+ def dump_to_path(self, obj, filepath, **kwargs):
+ super(PickleHandler, self).dump_to_path(
+ obj, filepath, mode='wb', **kwargs)
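A short sketch of the pickle handler's `protocol=2` default and how to override it per call; the import path is illustrative.

```python
import pickle

# Illustrative import path; adjust to your installation of mmcv_replace.
from mmcv_replace.fileio.handlers.pickle_handler import PickleHandler

handler = PickleHandler()
data = {'epoch': 12, 'miou': 0.47}

blob_default = handler.dump_to_str(data)  # uses protocol=2 unless overridden
blob_latest = handler.dump_to_str(data, protocol=pickle.HIGHEST_PROTOCOL)

assert pickle.loads(blob_default) == pickle.loads(blob_latest) == data
```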
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/fileio/handlers/yaml_handler.py b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/fileio/handlers/yaml_handler.py
new file mode 100644
index 0000000000000000000000000000000000000000..4d2aacb93549d18797f3d081420fffd3fcd9a4e0
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/fileio/handlers/yaml_handler.py
@@ -0,0 +1,37 @@
+# encoding=utf-8
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import yaml
+
+try:
+ from yaml import CLoader as Loader, CDumper as Dumper
+except ImportError:
+ from yaml import Loader, Dumper
+
+from .base import BaseFileHandler # isort:skip
+
+
+class YamlHandler(BaseFileHandler):
+
+ def load_from_fileobj(self, file, **kwargs):
+ kwargs.setdefault('Loader', Loader)
+ return yaml.load(file, **kwargs)
+
+ def dump_to_fileobj(self, obj, file, **kwargs):
+ kwargs.setdefault('Dumper', Dumper)
+ yaml.dump(obj, file, **kwargs)
+
+ def dump_to_str(self, obj, **kwargs):
+ kwargs.setdefault('Dumper', Dumper)
+ return yaml.dump(obj, **kwargs)
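The YAML handler prefers the libyaml-backed `CLoader`/`CDumper` and silently falls back to the pure-Python classes when the C bindings are unavailable. A small round-trip sketch, with an illustrative import path:

```python
from io import StringIO

# Illustrative import path; adjust to your installation of mmcv_replace.
from mmcv_replace.fileio.handlers.yaml_handler import YamlHandler

handler = YamlHandler()
cfg = {'lr': 0.01, 'milestones': [8, 11]}

text = handler.dump_to_str(cfg)                    # uses CDumper if available
assert handler.load_from_fileobj(StringIO(text)) == cfg
```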
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/fileio/io.py b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/fileio/io.py
new file mode 100644
index 0000000000000000000000000000000000000000..7536d1839bcd3ade037aaf07be3623ff14cac19d
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/fileio/io.py
@@ -0,0 +1,164 @@
+# encoding=utf-8
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from io import BytesIO, StringIO
+from pathlib import Path
+
+from ..utils import is_list_of, is_str
+from .file_client import FileClient
+from .handlers import BaseFileHandler, JsonHandler, PickleHandler, YamlHandler
+
+file_handlers = {
+ 'json': JsonHandler(),
+ 'yaml': YamlHandler(),
+ 'yml': YamlHandler(),
+ 'pickle': PickleHandler(),
+ 'pkl': PickleHandler()
+}
+
+
+def load(file, file_format=None, file_client_args=None, **kwargs):
+ """Load data from json/yaml/pickle files.
+
+ This method provides a unified api for loading data from serialized files.
+
+ Note:
+ In v1.3.16 and later, ``load`` supports loading data from serialized
+ files that can be stored in different backends.
+
+ Args:
+ file (str or :obj:`Path` or file-like object): Filename or a file-like
+ object.
+ file_format (str, optional): If not specified, the file format will be
+ inferred from the file extension, otherwise use the specified one.
+ Currently supported formats include "json", "yaml/yml" and
+ "pickle/pkl".
+ file_client_args (dict, optional): Arguments to instantiate a
+ FileClient. See :class:`mmcv.fileio.FileClient` for details.
+ Default: None.
+
+ Examples:
+ >>> load('/path/of/your/file') # file is storaged in disk
+ >>> load('https://path/of/your/file') # file is storaged in Internet
+ >>> load('s3://path/of/your/file') # file is storaged in petrel
+
+ Returns:
+ The content from the file.
+ """
+ if isinstance(file, Path):
+ file = str(file)
+ if file_format is None and is_str(file):
+ file_format = file.split('.')[-1]
+ if file_format not in file_handlers:
+ raise TypeError(f'Unsupported format: {file_format}')
+
+ handler = file_handlers[file_format]
+ if is_str(file):
+ file_client = FileClient.infer_client(file_client_args, file)
+ if handler.str_like:
+ with StringIO(file_client.get_text(file)) as f:
+ obj = handler.load_from_fileobj(f, **kwargs)
+ else:
+ with BytesIO(file_client.get(file)) as f:
+ obj = handler.load_from_fileobj(f, **kwargs)
+ elif hasattr(file, 'read'):
+ obj = handler.load_from_fileobj(file, **kwargs)
+ else:
+ raise TypeError('"file" must be a filepath str or a file-object')
+ return obj
+
+
+def dump(obj, file=None, file_format=None, file_client_args=None, **kwargs):
+ """Dump data to json/yaml/pickle strings or files.
+
+ This method provides a unified api for dumping data as strings or to files,
+ and also supports custom arguments for each file format.
+
+ Note:
+ In v1.3.16 and later, ``dump`` supports dumping data as strings or to
+ files which are saved to different backends.
+
+ Args:
+ obj (any): The python object to be dumped.
+ file (str or :obj:`Path` or file-like object, optional): If not
+ specified, then the object is dumped to a str, otherwise to a file
+ specified by the filename or file-like object.
+ file_format (str, optional): Same as :func:`load`.
+ file_client_args (dict, optional): Arguments to instantiate a
+ FileClient. See :class:`mmcv.fileio.FileClient` for details.
+ Default: None.
+
+ Examples:
+ >>> dump('hello world', '/path/of/your/file') # disk
+ >>> dump('hello world', 's3://path/of/your/file') # ceph or petrel
+
+ Returns:
+ bool: True for success, False otherwise.
+ """
+ if isinstance(file, Path):
+ file = str(file)
+ if file_format is None:
+ if is_str(file):
+ file_format = file.split('.')[-1]
+ elif file is None:
+ raise ValueError(
+ 'file_format must be specified since file is None')
+ if file_format not in file_handlers:
+ raise TypeError(f'Unsupported format: {file_format}')
+
+ handler = file_handlers[file_format]
+ if file is None:
+ return handler.dump_to_str(obj, **kwargs)
+ elif is_str(file):
+ file_client = FileClient.infer_client(file_client_args, file)
+ if handler.str_like:
+ with StringIO() as f:
+ handler.dump_to_fileobj(obj, f, **kwargs)
+ file_client.put_text(f.getvalue(), file)
+ else:
+ with BytesIO() as f:
+ handler.dump_to_fileobj(obj, f, **kwargs)
+ file_client.put(f.getvalue(), file)
+ elif hasattr(file, 'write'):
+ handler.dump_to_fileobj(obj, file, **kwargs)
+ else:
+ raise TypeError('"file" must be a filename str or a file-object')
+
+
+def _register_handler(handler, file_formats):
+ """Register a handler for some file extensions.
+
+ Args:
+ handler (:obj:`BaseFileHandler`): Handler to be registered.
+ file_formats (str or list[str]): File formats to be handled by this
+ handler.
+ """
+ if not isinstance(handler, BaseFileHandler):
+ raise TypeError(
+ f'handler must be a child of BaseFileHandler, not {type(handler)}')
+ if isinstance(file_formats, str):
+ file_formats = [file_formats]
+ if not is_list_of(file_formats, str):
+ raise TypeError('file_formats must be a str or a list of str')
+ for ext in file_formats:
+ file_handlers[ext] = handler
+
+
+def register_handler(file_formats, **kwargs):
+
+ def wrap(cls):
+ _register_handler(cls(**kwargs), file_formats)
+ return cls
+
+ return wrap
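`register_handler` is the decorator form of `_register_handler`, mapping extra file extensions onto `file_handlers`. A sketch registering a hypothetical plain-text handler; the `TxtHandler` class, the `.txt` mapping, the `/tmp` path, and the import paths are illustrative, not part of the upstream API.

```python
# Illustrative import paths; adjust to your installation of mmcv_replace.
from mmcv_replace.fileio.handlers import BaseFileHandler
from mmcv_replace.fileio.io import dump, load, register_handler


@register_handler('txt')
class TxtHandler(BaseFileHandler):
    """Hypothetical handler that treats a file as one plain string."""

    def load_from_fileobj(self, file):
        return file.read()

    def dump_to_fileobj(self, obj, file):
        file.write(str(obj))

    def dump_to_str(self, obj):
        return str(obj)


dump('hello world', '/tmp/demo.txt')   # routed to TxtHandler via the suffix
assert load('/tmp/demo.txt') == 'hello world'
```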
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/fileio/parse.py b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/fileio/parse.py
new file mode 100644
index 0000000000000000000000000000000000000000..487ee19563bf170a722b28f47209ec1b4bb02872
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/fileio/parse.py
@@ -0,0 +1,110 @@
+# encoding=utf-8
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from io import StringIO
+
+from .file_client import FileClient
+
+
+def list_from_file(filename,
+ prefix='',
+ offset=0,
+ max_num=0,
+ encoding='utf-8',
+ file_client_args=None):
+ """Load a text file and parse the content as a list of strings.
+
+ Note:
+ In v1.3.16 and later, ``list_from_file`` supports loading a text file
+ which can be stored in different backends and parsing the content as
+ a list of strings.
+
+ Args:
+ filename (str): Filename.
+ prefix (str): The prefix to be inserted at the beginning of each item.
+ offset (int): The number of lines to skip from the beginning of the file.
+ max_num (int): The maximum number of lines to be read,
+ zero and negative values mean no limit.
+ encoding (str): Encoding used to open the file. Default utf-8.
+ file_client_args (dict, optional): Arguments to instantiate a
+ FileClient. See :class:`mmcv.fileio.FileClient` for details.
+ Default: None.
+
+ Examples:
+ >>> list_from_file('/path/of/your/file') # disk
+ ['hello', 'world']
+ >>> list_from_file('s3://path/of/your/file') # ceph or petrel
+ ['hello', 'world']
+
+ Returns:
+ list[str]: A list of strings.
+ """
+ cnt = 0
+ item_list = []
+ file_client = FileClient.infer_client(file_client_args, filename)
+ with StringIO(file_client.get_text(filename, encoding)) as f:
+ for _ in range(offset):
+ f.readline()
+ for line in f:
+ if 0 < max_num <= cnt:
+ break
+ item_list.append(prefix + line.rstrip('\n\r'))
+ cnt += 1
+ return item_list
+
+
+def dict_from_file(filename,
+ key_type=str,
+ encoding='utf-8',
+ file_client_args=None):
+ """Load a text file and parse the content as a dict.
+
+ Each line of the text file should contain two or more columns split by
+ whitespace or tabs. The first column will be parsed as dict keys, and
+ the following columns will be parsed as dict values.
+
+ Note:
+ In v1.3.16 and later, ``dict_from_file`` supports loading a text file
+ which can be stored in different backends and parsing the content as
+ a dict.
+
+ Args:
+ filename (str): Filename.
+ key_type (type): Type of the dict keys. str is used by default and
+ type conversion will be performed if specified.
+ encoding (str): Encoding used to open the file. Default utf-8.
+ file_client_args (dict, optional): Arguments to instantiate a
+ FileClient. See :class:`mmcv.fileio.FileClient` for details.
+ Default: None.
+
+ Examples:
+ >>> dict_from_file('/path/of/your/file') # disk
+ {'key1': 'value1', 'key2': 'value2'}
+ >>> dict_from_file('s3://path/of/your/file') # ceph or petrel
+ {'key1': 'value1', 'key2': 'value2'}
+
+ Returns:
+ dict: The parsed contents.
+ """
+ mapping = {}
+ file_client = FileClient.infer_client(file_client_args, filename)
+ with StringIO(file_client.get_text(filename, encoding)) as f:
+ for line in f:
+ items = line.rstrip('\n').split()
+ assert len(items) >= 2
+ key = key_type(items[0])
+ val = items[1:] if len(items) > 2 else items[1]
+ mapping[key] = val
+ return mapping
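A small sketch of the two parsers above on a hypothetical split file; note how `dict_from_file` keeps a single trailing column as a string but folds multiple columns into a list, and how `key_type` casts the keys. The `/tmp` path and file contents are illustrative.

```python
# Illustrative import path; adjust to your installation of mmcv_replace.
from mmcv_replace.fileio.parse import dict_from_file, list_from_file

with open('/tmp/split.txt', 'w') as f:
    f.write('1 ADE_train_00000001\n'
            '2 ADE_train_00000002 val\n')

print(list_from_file('/tmp/split.txt', prefix='img/'))
# ['img/1 ADE_train_00000001', 'img/2 ADE_train_00000002 val']

print(dict_from_file('/tmp/split.txt', key_type=int))
# {1: 'ADE_train_00000001', 2: ['ADE_train_00000002', 'val']}
```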
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/image/__init__.py b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/image/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..8faf6456e7ac9f17ae511df7efe870d475b75b4f
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/image/__init__.py
@@ -0,0 +1,41 @@
+# encoding=utf-8
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .colorspace import (bgr2gray, bgr2hls, bgr2hsv, bgr2rgb, bgr2ycbcr,
+ gray2bgr, gray2rgb, hls2bgr, hsv2bgr, imconvert,
+ rgb2bgr, rgb2gray, rgb2ycbcr, ycbcr2bgr, ycbcr2rgb)
+from .geometric import (cutout, imcrop, imflip, imflip_, impad,
+ impad_to_multiple, imrescale, imresize, imresize_like,
+ imresize_to_multiple, imrotate, imshear, imtranslate,
+ rescale_size)
+from .io import imfrombytes, imread, imwrite, supported_backends, use_backend
+from .misc import tensor2imgs
+from .photometric import (adjust_brightness, adjust_color, adjust_contrast,
+ adjust_lighting, adjust_sharpness, auto_contrast,
+ clahe, imdenormalize, imequalize, iminvert,
+ imnormalize, imnormalize_, lut_transform, posterize,
+ solarize)
+
+__all__ = [
+ 'bgr2gray', 'bgr2hls', 'bgr2hsv', 'bgr2rgb', 'gray2bgr', 'gray2rgb',
+ 'hls2bgr', 'hsv2bgr', 'imconvert', 'rgb2bgr', 'rgb2gray', 'imrescale',
+ 'imresize', 'imresize_like', 'imresize_to_multiple', 'rescale_size',
+ 'imcrop', 'imflip', 'imflip_', 'impad', 'impad_to_multiple', 'imrotate',
+ 'imfrombytes', 'imread', 'imwrite', 'supported_backends', 'use_backend',
+ 'imdenormalize', 'imnormalize', 'imnormalize_', 'iminvert', 'posterize',
+ 'solarize', 'rgb2ycbcr', 'bgr2ycbcr', 'ycbcr2rgb', 'ycbcr2bgr',
+ 'tensor2imgs', 'imshear', 'imtranslate', 'adjust_color', 'imequalize',
+ 'adjust_brightness', 'adjust_contrast', 'lut_transform', 'clahe',
+ 'adjust_sharpness', 'auto_contrast', 'cutout', 'adjust_lighting'
+]
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/image/colorspace.py b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/image/colorspace.py
new file mode 100644
index 0000000000000000000000000000000000000000..f1289252008d1ebb41551998d5fc590ea7e57ce6
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/image/colorspace.py
@@ -0,0 +1,319 @@
+# encoding=utf-8
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import cv2
+import numpy as np
+
+
+def imconvert(img, src, dst):
+ """Convert an image from the src colorspace to dst colorspace.
+
+ Args:
+ img (ndarray): The input image.
+ src (str): The source colorspace, e.g., 'rgb', 'hsv'.
+ dst (str): The destination colorspace, e.g., 'rgb', 'hsv'.
+
+ Returns:
+ ndarray: The converted image.
+ """
+ code = getattr(cv2, f'COLOR_{src.upper()}2{dst.upper()}')
+ out_img = cv2.cvtColor(img, code)
+ return out_img
+
+
+def bgr2gray(img, keepdim=False):
+ """Convert a BGR image to grayscale image.
+
+ Args:
+ img (ndarray): The input image.
+ keepdim (bool): If False (by default), then return the grayscale image
+ with 2 dims, otherwise 3 dims.
+
+ Returns:
+ ndarray: The converted grayscale image.
+ """
+ out_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+ if keepdim:
+ out_img = out_img[..., None]
+ return out_img
+
+
+def rgb2gray(img, keepdim=False):
+ """Convert a RGB image to grayscale image.
+
+ Args:
+ img (ndarray): The input image.
+ keepdim (bool): If False (by default), then return the grayscale image
+ with 2 dims, otherwise 3 dims.
+
+ Returns:
+ ndarray: The converted grayscale image.
+ """
+ out_img = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
+ if keepdim:
+ out_img = out_img[..., None]
+ return out_img
+
+
+def gray2bgr(img):
+ """Convert a grayscale image to BGR image.
+
+ Args:
+ img (ndarray): The input image.
+
+ Returns:
+ ndarray: The converted BGR image.
+ """
+ img = img[..., None] if img.ndim == 2 else img
+ out_img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
+ return out_img
+
+
+def gray2rgb(img):
+ """Convert a grayscale image to RGB image.
+
+ Args:
+ img (ndarray): The input image.
+
+ Returns:
+ ndarray: The converted RGB image.
+ """
+ img = img[..., None] if img.ndim == 2 else img
+ out_img = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB)
+ return out_img
+
+
+def _convert_input_type_range(img):
+ """Convert the type and range of the input image.
+
+ It converts the input image to np.float32 type and range of [0, 1].
+ It is mainly used for pre-processing the input image in colorspace
+ conversion functions such as rgb2ycbcr and ycbcr2rgb.
+
+ Args:
+ img (ndarray): The input image. It accepts:
+ 1. np.uint8 type with range [0, 255];
+ 2. np.float32 type with range [0, 1].
+
+ Returns:
+ (ndarray): The converted image with type of np.float32 and range of
+ [0, 1].
+ """
+ img_type = img.dtype
+ img = img.astype(np.float32)
+ if img_type == np.float32:
+ pass
+ elif img_type == np.uint8:
+ img /= 255.
+ else:
+ raise TypeError('The img type should be np.float32 or np.uint8, '
+ f'but got {img_type}')
+ return img
+
+
+def _convert_output_type_range(img, dst_type):
+ """Convert the type and range of the image according to dst_type.
+
+ It converts the image to desired type and range. If `dst_type` is np.uint8,
+ images will be converted to np.uint8 type with range [0, 255]. If
+ `dst_type` is np.float32, it converts the image to np.float32 type with
+ range [0, 1].
+ It is mainly used for post-processing images in colorspace conversion
+ functions such as rgb2ycbcr and ycbcr2rgb.
+
+ Args:
+ img (ndarray): The image to be converted with np.float32 type and
+ range [0, 255].
+ dst_type (np.uint8 | np.float32): If dst_type is np.uint8, it
+ converts the image to np.uint8 type with range [0, 255]. If
+ dst_type is np.float32, it converts the image to np.float32 type
+ with range [0, 1].
+
+ Returns:
+ (ndarray): The converted image with desired type and range.
+ """
+ if dst_type not in (np.uint8, np.float32):
+ raise TypeError('The dst_type should be np.float32 or np.uint8, '
+ f'but got {dst_type}')
+ if dst_type == np.uint8:
+ img = img.round()
+ else:
+ img /= 255.
+ return img.astype(dst_type)
+
+
+def rgb2ycbcr(img, y_only=False):
+ """Convert a RGB image to YCbCr image.
+
+ This function produces the same results as Matlab's `rgb2ycbcr` function.
+ It implements the ITU-R BT.601 conversion for standard-definition
+ television. See more details in
+ https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.601_conversion.
+
+ It differs from a similar function in cv2.cvtColor: `RGB <-> YCrCb`.
+ In OpenCV, it implements a JPEG conversion. See more details in
+ https://en.wikipedia.org/wiki/YCbCr#JPEG_conversion.
+
+ Args:
+ img (ndarray): The input image. It accepts:
+ 1. np.uint8 type with range [0, 255];
+ 2. np.float32 type with range [0, 1].
+ y_only (bool): Whether to only return Y channel. Default: False.
+
+ Returns:
+ ndarray: The converted YCbCr image. The output image has the same type
+ and range as input image.
+ """
+ img_type = img.dtype
+ img = _convert_input_type_range(img)
+ if y_only:
+ out_img = np.dot(img, [65.481, 128.553, 24.966]) + 16.0
+ else:
+ out_img = np.matmul(
+ img, [[65.481, -37.797, 112.0], [128.553, -74.203, -93.786],
+ [24.966, 112.0, -18.214]]) + [16, 128, 128]
+ out_img = _convert_output_type_range(out_img, img_type)
+ return out_img
+
+
+def bgr2ycbcr(img, y_only=False):
+ """Convert a BGR image to YCbCr image.
+
+ The bgr version of rgb2ycbcr.
+ It implements the ITU-R BT.601 conversion for standard-definition
+ television. See more details in
+ https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.601_conversion.
+
+ It differs from a similar function in cv2.cvtColor: `BGR <-> YCrCb`.
+ In OpenCV, it implements a JPEG conversion. See more details in
+ https://en.wikipedia.org/wiki/YCbCr#JPEG_conversion.
+
+ Args:
+ img (ndarray): The input image. It accepts:
+ 1. np.uint8 type with range [0, 255];
+ 2. np.float32 type with range [0, 1].
+ y_only (bool): Whether to only return Y channel. Default: False.
+
+ Returns:
+ ndarray: The converted YCbCr image. The output image has the same type
+ and range as input image.
+ """
+ img_type = img.dtype
+ img = _convert_input_type_range(img)
+ if y_only:
+ out_img = np.dot(img, [24.966, 128.553, 65.481]) + 16.0
+ else:
+ out_img = np.matmul(
+ img, [[24.966, 112.0, -18.214], [128.553, -74.203, -93.786],
+ [65.481, -37.797, 112.0]]) + [16, 128, 128]
+ out_img = _convert_output_type_range(out_img, img_type)
+ return out_img
+
+
+def ycbcr2rgb(img):
+ """Convert a YCbCr image to RGB image.
+
+ This function produces the same results as Matlab's ycbcr2rgb function.
+ It implements the ITU-R BT.601 conversion for standard-definition
+ television. See more details in
+ https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.601_conversion.
+
+ It differs from a similar function in cv2.cvtColor: `YCrCb <-> RGB`.
+ In OpenCV, it implements a JPEG conversion. See more details in
+ https://en.wikipedia.org/wiki/YCbCr#JPEG_conversion.
+
+ Args:
+ img (ndarray): The input image. It accepts:
+ 1. np.uint8 type with range [0, 255];
+ 2. np.float32 type with range [0, 1].
+
+ Returns:
+ ndarray: The converted RGB image. The output image has the same type
+ and range as input image.
+ """
+ img_type = img.dtype
+ img = _convert_input_type_range(img) * 255
+ out_img = np.matmul(img, [[0.00456621, 0.00456621, 0.00456621],
+ [0, -0.00153632, 0.00791071],
+ [0.00625893, -0.00318811, 0]]) * 255.0 + [
+ -222.921, 135.576, -276.836
+ ]
+ out_img = _convert_output_type_range(out_img, img_type)
+ return out_img
+
+
+def ycbcr2bgr(img):
+ """Convert a YCbCr image to BGR image.
+
+ The bgr version of ycbcr2rgb.
+ It implements the ITU-R BT.601 conversion for standard-definition
+ television. See more details in
+ https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.601_conversion.
+
+ It differs from a similar function in cv2.cvtColor: `YCrCb <-> BGR`.
+ In OpenCV, it implements a JPEG conversion. See more details in
+ https://en.wikipedia.org/wiki/YCbCr#JPEG_conversion.
+
+ Args:
+ img (ndarray): The input image. It accepts:
+ 1. np.uint8 type with range [0, 255];
+ 2. np.float32 type with range [0, 1].
+
+ Returns:
+ ndarray: The converted BGR image. The output image has the same type
+ and range as input image.
+ """
+ img_type = img.dtype
+ img = _convert_input_type_range(img) * 255
+ out_img = np.matmul(img, [[0.00456621, 0.00456621, 0.00456621],
+ [0.00791071, -0.00153632, 0],
+ [0, -0.00318811, 0.00625893]]) * 255.0 + [
+ -276.836, 135.576, -222.921
+ ]
+ out_img = _convert_output_type_range(out_img, img_type)
+ return out_img
+
+
+def convert_color_factory(src, dst):
+
+ code = getattr(cv2, f'COLOR_{src.upper()}2{dst.upper()}')
+
+ def convert_color(img):
+ out_img = cv2.cvtColor(img, code)
+ return out_img
+
+ convert_color.__doc__ = f"""Convert a {src.upper()} image to {dst.upper()}
+ image.
+
+ Args:
+ img (ndarray): The input image.
+
+ Returns:
+ ndarray: The converted {dst.upper()} image.
+ """
+
+ return convert_color
+
+
+bgr2rgb = convert_color_factory('bgr', 'rgb')
+
+rgb2bgr = convert_color_factory('rgb', 'bgr')
+
+bgr2hsv = convert_color_factory('bgr', 'hsv')
+
+hsv2bgr = convert_color_factory('hsv', 'bgr')
+
+bgr2hls = convert_color_factory('bgr', 'hls')
+
+hls2bgr = convert_color_factory('hls', 'bgr')
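A brief sketch of how the BT.601 converters above preserve the input dtype and range through `_convert_input_type_range` / `_convert_output_type_range`; exact uint8 round trips are not guaranteed because of rounding, and the import path is illustrative.

```python
import numpy as np

# Illustrative import path; adjust to your installation of mmcv_replace.
from mmcv_replace.image.colorspace import rgb2ycbcr, ycbcr2rgb

rgb_u8 = np.random.randint(0, 256, (4, 4, 3), dtype=np.uint8)
ycbcr_u8 = rgb2ycbcr(rgb_u8)          # stays uint8 in [0, 255]
rgb_back = ycbcr2rgb(ycbcr_u8)        # uint8 again, equal up to rounding

rgb_f32 = rgb_u8.astype(np.float32) / 255.0
luma = rgb2ycbcr(rgb_f32, y_only=True)   # float32 luma, roughly [16/255, 235/255]

assert ycbcr_u8.dtype == np.uint8 and luma.dtype == np.float32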
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/image/geometric.py b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/image/geometric.py
new file mode 100644
index 0000000000000000000000000000000000000000..af84f926ca3a6a5b9cc882ffb15148b4dabd5473
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/image/geometric.py
@@ -0,0 +1,741 @@
+# encoding=utf-8
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import numbers
+
+import cv2
+import numpy as np
+
+from ..utils import to_2tuple
+from .io import imread_backend
+
+try:
+ from PIL import Image
+except ImportError:
+ Image = None
+
+
+def _scale_size(size, scale):
+ """Rescale a size by a ratio.
+
+ Args:
+ size (tuple[int]): (w, h).
+ scale (float | tuple(float)): Scaling factor.
+
+ Returns:
+ tuple[int]: scaled size.
+ """
+ if isinstance(scale, (float, int)):
+ scale = (scale, scale)
+ w, h = size
+ return int(w * float(scale[0]) + 0.5), int(h * float(scale[1]) + 0.5)
+
+
+cv2_interp_codes = {
+ 'nearest': cv2.INTER_NEAREST,
+ 'bilinear': cv2.INTER_LINEAR,
+ 'bicubic': cv2.INTER_CUBIC,
+ 'area': cv2.INTER_AREA,
+ 'lanczos': cv2.INTER_LANCZOS4
+}
+
+if Image is not None:
+ pillow_interp_codes = {
+ 'nearest': Image.NEAREST,
+ 'bilinear': Image.BILINEAR,
+ 'bicubic': Image.BICUBIC,
+ 'box': Image.BOX,
+ 'lanczos': Image.LANCZOS,
+ 'hamming': Image.HAMMING
+ }
+
+
+def imresize(img,
+ size,
+ return_scale=False,
+ interpolation='bilinear',
+ out=None,
+ backend=None):
+ """Resize image to a given size.
+
+ Args:
+ img (ndarray): The input image.
+ size (tuple[int]): Target size (w, h).
+ return_scale (bool): Whether to return `w_scale` and `h_scale`.
+ interpolation (str): Interpolation method, accepted values are
+ "nearest", "bilinear", "bicubic", "area", "lanczos" for 'cv2'
+ backend, "nearest", "bilinear" for 'pillow' backend.
+ out (ndarray): The output destination.
+ backend (str | None): The image resize backend type. Options are `cv2`,
+ `pillow`, `None`. If backend is None, the global imread_backend
+ specified by ``mmcv.use_backend()`` will be used. Default: None.
+
+ Returns:
+ tuple | ndarray: (`resized_img`, `w_scale`, `h_scale`) or
+ `resized_img`.
+ """
+ h, w = img.shape[:2]
+ if backend is None:
+ backend = imread_backend
+ if backend not in ['cv2', 'pillow']:
+ raise ValueError(f'backend: {backend} is not supported for resize. '
+ f"Supported backends are 'cv2', 'pillow'")
+
+ if backend == 'pillow':
+ assert img.dtype == np.uint8, 'Pillow backend only support uint8 type'
+ pil_image = Image.fromarray(img)
+ pil_image = pil_image.resize(size, pillow_interp_codes[interpolation])
+ resized_img = np.array(pil_image)
+ else:
+ resized_img = cv2.resize(
+ img, size, dst=out, interpolation=cv2_interp_codes[interpolation])
+ if not return_scale:
+ return resized_img
+ else:
+ w_scale = size[0] / w
+ h_scale = size[1] / h
+ return resized_img, w_scale, h_scale
+
+
+def imresize_to_multiple(img,
+ divisor,
+ size=None,
+ scale_factor=None,
+ keep_ratio=False,
+ return_scale=False,
+ interpolation='bilinear',
+ out=None,
+ backend=None):
+ """Resize image according to a given size or scale factor and then rounds
+ up the the resized or rescaled image size to the nearest value that can be
+ divided by the divisor.
+
+ Args:
+ img (ndarray): The input image.
+ divisor (int | tuple): Resized image size will be a multiple of
+ divisor. If divisor is a tuple, divisor should be
+ (w_divisor, h_divisor).
+ size (None | int | tuple[int]): Target size (w, h). Default: None.
+ scale_factor (None | float | tuple[float]): Multiplier for spatial
+ size. Should match input size if it is a tuple and the 2D style is
+ (w_scale_factor, h_scale_factor). Default: None.
+ keep_ratio (bool): Whether to keep the aspect ratio when resizing the
+ image. Default: False.
+ return_scale (bool): Whether to return `w_scale` and `h_scale`.
+ interpolation (str): Interpolation method, accepted values are
+ "nearest", "bilinear", "bicubic", "area", "lanczos" for 'cv2'
+ backend, "nearest", "bilinear" for 'pillow' backend.
+ out (ndarray): The output destination.
+ backend (str | None): The image resize backend type. Options are `cv2`,
+ `pillow`, `None`. If backend is None, the global imread_backend
+ specified by ``mmcv.use_backend()`` will be used. Default: None.
+
+ Returns:
+ tuple | ndarray: (`resized_img`, `w_scale`, `h_scale`) or
+ `resized_img`.
+ """
+ h, w = img.shape[:2]
+ if size is not None and scale_factor is not None:
+ raise ValueError('only one of size or scale_factor should be defined')
+ elif size is None and scale_factor is None:
+ raise ValueError('one of size or scale_factor should be defined')
+ elif size is not None:
+ size = to_2tuple(size)
+ if keep_ratio:
+ size = rescale_size((w, h), size, return_scale=False)
+ else:
+ size = _scale_size((w, h), scale_factor)
+
+ divisor = to_2tuple(divisor)
+ size = tuple([int(np.ceil(s / d)) * d for s, d in zip(size, divisor)])
+ resized_img, w_scale, h_scale = imresize(
+ img,
+ size,
+ return_scale=True,
+ interpolation=interpolation,
+ out=out,
+ backend=backend)
+ if return_scale:
+ return resized_img, w_scale, h_scale
+ else:
+ return resized_img
+
+
+def imresize_like(img,
+ dst_img,
+ return_scale=False,
+ interpolation='bilinear',
+ backend=None):
+ """Resize image to the same size of a given image.
+
+ Args:
+ img (ndarray): The input image.
+ dst_img (ndarray): The target image.
+ return_scale (bool): Whether to return `w_scale` and `h_scale`.
+ interpolation (str): Same as :func:`resize`.
+ backend (str | None): Same as :func:`resize`.
+
+ Returns:
+ tuple or ndarray: (`resized_img`, `w_scale`, `h_scale`) or
+ `resized_img`.
+ """
+ h, w = dst_img.shape[:2]
+ return imresize(img, (w, h), return_scale, interpolation, backend=backend)
+
+
+def rescale_size(old_size, scale, return_scale=False):
+ """Calculate the new size to be rescaled to.
+
+ Args:
+ old_size (tuple[int]): The old size (w, h) of image.
+ scale (float | tuple[int]): The scaling factor or maximum size.
+ If it is a float number, then the image will be rescaled by this
+ factor, else if it is a tuple of 2 integers, then the image will
+ be rescaled as large as possible within the scale.
+ return_scale (bool): Whether to return the scaling factor besides the
+ rescaled image size.
+
+ Returns:
+ tuple[int]: The new rescaled image size.
+ """
+ w, h = old_size
+ if isinstance(scale, (float, int)):
+ if scale <= 0:
+ raise ValueError(f'Invalid scale {scale}, must be positive.')
+ scale_factor = scale
+ elif isinstance(scale, tuple):
+ max_long_edge = max(scale)
+ max_short_edge = min(scale)
+ scale_factor = min(max_long_edge / max(h, w),
+ max_short_edge / min(h, w))
+ else:
+ raise TypeError(
+ f'Scale must be a number or tuple of int, but got {type(scale)}')
+
+ new_size = _scale_size((w, h), scale_factor)
+
+ if return_scale:
+ return new_size, scale_factor
+ else:
+ return new_size
+
+
+def imrescale(img,
+ scale,
+ return_scale=False,
+ interpolation='bilinear',
+ backend=None):
+ """Resize image while keeping the aspect ratio.
+
+ Args:
+ img (ndarray): The input image.
+ scale (float | tuple[int]): The scaling factor or maximum size.
+ If it is a float number, then the image will be rescaled by this
+ factor, else if it is a tuple of 2 integers, then the image will
+ be rescaled as large as possible within the scale.
+ return_scale (bool): Whether to return the scaling factor besides the
+ rescaled image.
+ interpolation (str): Same as :func:`resize`.
+ backend (str | None): Same as :func:`resize`.
+
+ Returns:
+ ndarray: The rescaled image.
+ """
+ h, w = img.shape[:2]
+ new_size, scale_factor = rescale_size((w, h), scale, return_scale=True)
+ rescaled_img = imresize(
+ img, new_size, interpolation=interpolation, backend=backend)
+ if return_scale:
+ return rescaled_img, scale_factor
+ else:
+ return rescaled_img
+
+
+def imflip(img, direction='horizontal'):
+ """Flip an image horizontally or vertically.
+
+ Args:
+ img (ndarray): Image to be flipped.
+ direction (str): The flip direction, either "horizontal" or
+ "vertical" or "diagonal".
+
+ Returns:
+ ndarray: The flipped image.
+ """
+ assert direction in ['horizontal', 'vertical', 'diagonal']
+ if direction == 'horizontal':
+ return np.flip(img, axis=1)
+ elif direction == 'vertical':
+ return np.flip(img, axis=0)
+ else:
+ return np.flip(img, axis=(0, 1))
+
+
+def imflip_(img, direction='horizontal'):
+ """Inplace flip an image horizontally or vertically.
+
+ Args:
+ img (ndarray): Image to be flipped.
+ direction (str): The flip direction, either "horizontal" or
+ "vertical" or "diagonal".
+
+ Returns:
+ ndarray: The flipped image (inplace).
+ """
+ assert direction in ['horizontal', 'vertical', 'diagonal']
+ if direction == 'horizontal':
+ return cv2.flip(img, 1, img)
+ elif direction == 'vertical':
+ return cv2.flip(img, 0, img)
+ else:
+ return cv2.flip(img, -1, img)
+
+
+def imrotate(img,
+ angle,
+ center=None,
+ scale=1.0,
+ border_value=0,
+ interpolation='bilinear',
+ auto_bound=False):
+ """Rotate an image.
+
+ Args:
+ img (ndarray): Image to be rotated.
+ angle (float): Rotation angle in degrees, positive values mean
+ clockwise rotation.
+ center (tuple[float], optional): Center point (w, h) of the rotation in
+ the source image. If not specified, the center of the image will be
+ used.
+ scale (float): Isotropic scale factor.
+ border_value (int): Border value.
+ interpolation (str): Same as :func:`resize`.
+ auto_bound (bool): Whether to adjust the image size to cover the whole
+ rotated image.
+
+ Returns:
+ ndarray: The rotated image.
+ """
+ if center is not None and auto_bound:
+ raise ValueError('`auto_bound` conflicts with `center`')
+ h, w = img.shape[:2]
+ if center is None:
+ center = ((w - 1) * 0.5, (h - 1) * 0.5)
+ assert isinstance(center, tuple)
+
+ matrix = cv2.getRotationMatrix2D(center, -angle, scale)
+ if auto_bound:
+ cos = np.abs(matrix[0, 0])
+ sin = np.abs(matrix[0, 1])
+ new_w = h * sin + w * cos
+ new_h = h * cos + w * sin
+ matrix[0, 2] += (new_w - w) * 0.5
+ matrix[1, 2] += (new_h - h) * 0.5
+ w = int(np.round(new_w))
+ h = int(np.round(new_h))
+ rotated = cv2.warpAffine(
+ img,
+ matrix, (w, h),
+ flags=cv2_interp_codes[interpolation],
+ borderValue=border_value)
+ return rotated
+
+
+def bbox_clip(bboxes, img_shape):
+ """Clip bboxes to fit the image shape.
+
+ Args:
+ bboxes (ndarray): Shape (..., 4*k)
+ img_shape (tuple[int]): (height, width) of the image.
+
+ Returns:
+ ndarray: Clipped bboxes.
+ """
+ assert bboxes.shape[-1] % 4 == 0
+ cmin = np.empty(bboxes.shape[-1], dtype=bboxes.dtype)
+ cmin[0::2] = img_shape[1] - 1
+ cmin[1::2] = img_shape[0] - 1
+ clipped_bboxes = np.maximum(np.minimum(bboxes, cmin), 0)
+ return clipped_bboxes
+
+
+def bbox_scaling(bboxes, scale, clip_shape=None):
+ """Scaling bboxes w.r.t the box center.
+
+ Args:
+ bboxes (ndarray): Shape(..., 4).
+ scale (float): Scaling factor.
+ clip_shape (tuple[int], optional): If specified, bboxes that exceed the
+ boundary will be clipped according to the given shape (h, w).
+
+ Returns:
+ ndarray: Scaled bboxes.
+ """
+ if float(scale) == 1.0:
+ scaled_bboxes = bboxes.copy()
+ else:
+ w = bboxes[..., 2] - bboxes[..., 0] + 1
+ h = bboxes[..., 3] - bboxes[..., 1] + 1
+ dw = (w * (scale - 1)) * 0.5
+ dh = (h * (scale - 1)) * 0.5
+ scaled_bboxes = bboxes + np.stack((-dw, -dh, dw, dh), axis=-1)
+ if clip_shape is not None:
+ return bbox_clip(scaled_bboxes, clip_shape)
+ else:
+ return scaled_bboxes
+
+
+def imcrop(img, bboxes, scale=1.0, pad_fill=None):
+ """Crop image patches.
+
+ 3 steps: scale the bboxes -> clip bboxes -> crop and pad.
+
+ Args:
+ img (ndarray): Image to be cropped.
+ bboxes (ndarray): Shape (k, 4) or (4, ), location of cropped bboxes.
+ scale (float, optional): Scale ratio of bboxes, the default value
+ 1.0 means no scaling.
+ pad_fill (Number | list[Number]): Value to be filled for padding.
+ Default: None, which means no padding.
+
+ Returns:
+ list[ndarray] | ndarray: The cropped image patches.
+ """
+ chn = 1 if img.ndim == 2 else img.shape[2]
+ if pad_fill is not None:
+ if isinstance(pad_fill, (int, float)):
+ pad_fill = [pad_fill for _ in range(chn)]
+ assert len(pad_fill) == chn
+
+ _bboxes = bboxes[None, ...] if bboxes.ndim == 1 else bboxes
+ scaled_bboxes = bbox_scaling(_bboxes, scale).astype(np.int32)
+ clipped_bbox = bbox_clip(scaled_bboxes, img.shape)
+
+ patches = []
+ for i in range(clipped_bbox.shape[0]):
+ x1, y1, x2, y2 = tuple(clipped_bbox[i, :])
+ if pad_fill is None:
+ patch = img[y1:y2 + 1, x1:x2 + 1, ...]
+ else:
+ _x1, _y1, _x2, _y2 = tuple(scaled_bboxes[i, :])
+ if chn == 1:
+ patch_shape = (_y2 - _y1 + 1, _x2 - _x1 + 1)
+ else:
+ patch_shape = (_y2 - _y1 + 1, _x2 - _x1 + 1, chn)
+ patch = np.array(
+ pad_fill, dtype=img.dtype) * np.ones(
+ patch_shape, dtype=img.dtype)
+ x_start = 0 if _x1 >= 0 else -_x1
+ y_start = 0 if _y1 >= 0 else -_y1
+ w = x2 - x1 + 1
+ h = y2 - y1 + 1
+ patch[y_start:y_start + h, x_start:x_start + w,
+ ...] = img[y1:y1 + h, x1:x1 + w, ...]
+ patches.append(patch)
+
+ if bboxes.ndim == 1:
+ return patches[0]
+ else:
+ return patches
+
+
+def impad(img,
+ *,
+ shape=None,
+ padding=None,
+ pad_val=0,
+ padding_mode='constant'):
+ """Pad the given image to a certain shape or pad on all sides with
+ specified padding mode and padding value.
+
+ Args:
+ img (ndarray): Image to be padded.
+ shape (tuple[int]): Expected padding shape (h, w). Default: None.
+ padding (int or tuple[int]): Padding on each border. If a single int is
+ provided this is used to pad all borders. If tuple of length 2 is
+ provided this is the padding on left/right and top/bottom
+ respectively. If a tuple of length 4 is provided this is the
+ padding for the left, top, right and bottom borders respectively.
+ Default: None. Note that `shape` and `padding` can not be both
+ set.
+ pad_val (Number | Sequence[Number]): Values to be filled in padding
+ areas when padding_mode is 'constant'. Default: 0.
+ padding_mode (str): Type of padding. Should be: constant, edge,
+ reflect or symmetric. Default: constant.
+
+ - constant: pads with a constant value, this value is specified
+ with pad_val.
+ - edge: pads with the last value at the edge of the image.
+ - reflect: pads with reflection of image without repeating the last
+ value on the edge. For example, padding [1, 2, 3, 4] with 2
+ elements on both sides in reflect mode will result in
+ [3, 2, 1, 2, 3, 4, 3, 2].
+ - symmetric: pads with reflection of image repeating the last value
+ on the edge. For example, padding [1, 2, 3, 4] with 2 elements on
+ both sides in symmetric mode will result in
+ [2, 1, 1, 2, 3, 4, 4, 3]
+
+ Returns:
+ ndarray: The padded image.
+ """
+
+ assert (shape is not None) ^ (padding is not None)
+ if shape is not None:
+ padding = (0, 0, shape[1] - img.shape[1], shape[0] - img.shape[0])
+
+ # check pad_val
+ if isinstance(pad_val, tuple):
+ assert len(pad_val) == img.shape[-1]
+ elif not isinstance(pad_val, numbers.Number):
+ raise TypeError('pad_val must be an int or a tuple. '
+ f'But received {type(pad_val)}')
+
+ # check padding
+ if isinstance(padding, tuple) and len(padding) in [2, 4]:
+ if len(padding) == 2:
+ padding = (padding[0], padding[1], padding[0], padding[1])
+ elif isinstance(padding, numbers.Number):
+ padding = (padding, padding, padding, padding)
+ else:
+ raise ValueError('Padding must be an int or a 2- or 4-element tuple. '
+ f'But received {padding}')
+
+ # check padding mode
+ assert padding_mode in ['constant', 'edge', 'reflect', 'symmetric']
+
+ border_type = {
+ 'constant': cv2.BORDER_CONSTANT,
+ 'edge': cv2.BORDER_REPLICATE,
+ 'reflect': cv2.BORDER_REFLECT_101,
+ 'symmetric': cv2.BORDER_REFLECT
+ }
+ img = cv2.copyMakeBorder(
+ img,
+ padding[1],
+ padding[3],
+ padding[0],
+ padding[2],
+ border_type[padding_mode],
+ value=pad_val)
+
+ return img
+
+
+def impad_to_multiple(img, divisor, pad_val=0):
+ """Pad an image to ensure each edge to be multiple to some number.
+
+ Args:
+ img (ndarray): Image to be padded.
+ divisor (int): Padded image edges will be multiples of divisor.
+ pad_val (Number | Sequence[Number]): Same as :func:`impad`.
+
+ Returns:
+ ndarray: The padded image.
+ """
+ pad_h = int(np.ceil(img.shape[0] / divisor)) * divisor
+ pad_w = int(np.ceil(img.shape[1] / divisor)) * divisor
+ return impad(img, shape=(pad_h, pad_w), pad_val=pad_val)
+
+
+def cutout(img, shape, pad_val=0):
+ """Randomly cut out a rectangle from the original img.
+
+ Args:
+ img (ndarray): Image to be cutout.
+ shape (int | tuple[int]): Expected cutout shape (h, w). If given as an
+ int, the value will be used for both h and w.
+ pad_val (int | float | tuple[int | float]): Values to be filled in the
+ cut area. Defaults to 0.
+
+ Returns:
+ ndarray: The cutout image.
+ """
+
+ channels = 1 if img.ndim == 2 else img.shape[2]
+ if isinstance(shape, int):
+ cut_h, cut_w = shape, shape
+ else:
+ assert isinstance(shape, tuple) and len(shape) == 2, \
+ f'shape must be an int or a tuple with length 2, but got type ' \
+ f'{type(shape)} instead.'
+ cut_h, cut_w = shape
+ if isinstance(pad_val, (int, float)):
+ pad_val = tuple([pad_val] * channels)
+ elif isinstance(pad_val, tuple):
+ assert len(pad_val) == channels, \
+ 'Expected the num of elements in tuple equals the channels ' \
+ 'of input image. Found {} vs {}'.format(
+ len(pad_val), channels)
+ else:
+ raise TypeError(f'Invalid type {type(pad_val)} for `pad_val`')
+
+ img_h, img_w = img.shape[:2]
+ y0 = np.random.uniform(img_h)
+ x0 = np.random.uniform(img_w)
+
+ y1 = int(max(0, y0 - cut_h / 2.))
+ x1 = int(max(0, x0 - cut_w / 2.))
+ y2 = min(img_h, y1 + cut_h)
+ x2 = min(img_w, x1 + cut_w)
+
+ if img.ndim == 2:
+ patch_shape = (y2 - y1, x2 - x1)
+ else:
+ patch_shape = (y2 - y1, x2 - x1, channels)
+
+ img_cutout = img.copy()
+ patch = np.array(
+ pad_val, dtype=img.dtype) * np.ones(
+ patch_shape, dtype=img.dtype)
+ img_cutout[y1:y2, x1:x2, ...] = patch
+
+ return img_cutout
+
+
+def _get_shear_matrix(magnitude, direction='horizontal'):
+ """Generate the shear matrix for transformation.
+
+ Args:
+ magnitude (int | float): The magnitude used for shear.
+ direction (str): The shear direction, either "horizontal"
+ or "vertical".
+
+ Returns:
+ ndarray: The shear matrix with dtype float32.
+ """
+ if direction == 'horizontal':
+ shear_matrix = np.float32([[1, magnitude, 0], [0, 1, 0]])
+ elif direction == 'vertical':
+ shear_matrix = np.float32([[1, 0, 0], [magnitude, 1, 0]])
+ return shear_matrix
+
+
+def imshear(img,
+ magnitude,
+ direction='horizontal',
+ border_value=0,
+ interpolation='bilinear'):
+ """Shear an image.
+
+ Args:
+ img (ndarray): Image to be sheared with format (h, w)
+ or (h, w, c).
+ magnitude (int | float): The magnitude used for shear.
+ direction (str): The shear direction, either "horizontal"
+ or "vertical".
+ border_value (int | tuple[int]): Value used in case of a
+ constant border.
+ interpolation (str): Same as :func:`resize`.
+
+ Returns:
+ ndarray: The sheared image.
+ """
+ assert direction in ['horizontal',
+ 'vertical'], f'Invalid direction: {direction}'
+ height, width = img.shape[:2]
+ if img.ndim == 2:
+ channels = 1
+ elif img.ndim == 3:
+ channels = img.shape[-1]
+ if isinstance(border_value, int):
+ border_value = tuple([border_value] * channels)
+ elif isinstance(border_value, tuple):
+ assert len(border_value) == channels, \
+ 'Expected the num of elements in tuple equals the channels ' \
+ 'of input image. Found {} vs {}'.format(
+ len(border_value), channels)
+ else:
+ raise ValueError(
+ f'Invalid type {type(border_value)} for `border_value`')
+ shear_matrix = _get_shear_matrix(magnitude, direction)
+ sheared = cv2.warpAffine(
+ img,
+ shear_matrix,
+ (width, height),
+ # Note: when the number of elements in `border_value` is
+ # greater than 3 (e.g. shearing masks whose number of channels
+ # is larger than 3), `cv2.warpAffine` will raise a TypeError.
+ # Here we simply slice the first 3 values in `border_value`.
+ borderValue=border_value[:3],
+ flags=cv2_interp_codes[interpolation])
+ return sheared
+
+
+def _get_translate_matrix(offset, direction='horizontal'):
+ """Generate the translate matrix.
+
+ Args:
+ offset (int | float): The offset used for translate.
+ direction (str): The translate direction, either
+ "horizontal" or "vertical".
+
+ Returns:
+ ndarray: The translate matrix with dtype float32.
+ """
+ if direction == 'horizontal':
+ translate_matrix = np.float32([[1, 0, offset], [0, 1, 0]])
+ elif direction == 'vertical':
+ translate_matrix = np.float32([[1, 0, 0], [0, 1, offset]])
+ return translate_matrix
+
+
+def imtranslate(img,
+ offset,
+ direction='horizontal',
+ border_value=0,
+ interpolation='bilinear'):
+ """Translate an image.
+
+ Args:
+ img (ndarray): Image to be translated with format
+ (h, w) or (h, w, c).
+ offset (int | float): The offset used for translate.
+ direction (str): The translate direction, either "horizontal"
+ or "vertical".
+ border_value (int | tuple[int]): Value used in case of a
+ constant border.
+ interpolation (str): Same as :func:`resize`.
+
+ Returns:
+ ndarray: The translated image.
+ """
+ assert direction in ['horizontal',
+ 'vertical'], f'Invalid direction: {direction}'
+ height, width = img.shape[:2]
+ if img.ndim == 2:
+ channels = 1
+ elif img.ndim == 3:
+ channels = img.shape[-1]
+ if isinstance(border_value, int):
+ border_value = tuple([border_value] * channels)
+ elif isinstance(border_value, tuple):
+ assert len(border_value) == channels, \
+ 'Expected the num of elements in tuple equals the channels ' \
+ 'of input image. Found {} vs {}'.format(
+ len(border_value), channels)
+ else:
+ raise ValueError(
+ f'Invalid type {type(border_value)} for `border_value`.')
+ translate_matrix = _get_translate_matrix(offset, direction)
+ translated = cv2.warpAffine(
+ img,
+ translate_matrix,
+ (width, height),
+ # Note: when the number of elements in `border_value` is
+ # greater than 3 (e.g. translating masks whose number of channels
+ # is larger than 3), `cv2.warpAffine` will raise a TypeError.
+ # Here we simply slice the first 3 values in `border_value`.
+ borderValue=border_value[:3],
+ flags=cv2_interp_codes[interpolation])
+ return translated
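A sketch of how the resizing and padding helpers above compose in a typical segmentation preprocessing step: keep the aspect ratio with `imrescale`, then pad to a stride-friendly shape with `impad_to_multiple`. The shapes, the divisor of 32, and the import path are illustrative.

```python
import numpy as np

# Illustrative import path; adjust to your installation of mmcv_replace.
from mmcv_replace.image.geometric import impad_to_multiple, imrescale

img = np.zeros((375, 500, 3), dtype=np.uint8)          # (h, w, c)

# Rescale so the image fits inside a 512 x 384 box while keeping the ratio.
rescaled, scale = imrescale(img, (512, 384), return_scale=True)
print(rescaled.shape, scale)                           # (384, 512, 3) 1.024

# Pad height and width up to the next multiple of 32 (a no-op here).
padded = impad_to_multiple(rescaled, divisor=32)
print(padded.shape)                                    # (384, 512, 3)
```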
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/image/io.py b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/image/io.py
new file mode 100644
index 0000000000000000000000000000000000000000..e05effbd95713c3fc76cc5739d6dc616dc997005
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/image/io.py
@@ -0,0 +1,325 @@
+# encoding=utf-8
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import io
+import os.path as osp
+import warnings
+from pathlib import Path
+
+import cv2
+import numpy as np
+from cv2 import (IMREAD_COLOR, IMREAD_GRAYSCALE, IMREAD_IGNORE_ORIENTATION,
+ IMREAD_UNCHANGED)
+
+from mmcv.fileio import FileClient
+from mmcv.utils import is_filepath, is_str
+
+try:
+ from turbojpeg import TJCS_RGB, TJPF_BGR, TJPF_GRAY, TurboJPEG
+except ImportError:
+ TJCS_RGB = TJPF_GRAY = TJPF_BGR = TurboJPEG = None
+
+try:
+ from PIL import Image, ImageOps
+except ImportError:
+ Image = None
+
+try:
+ import tifffile
+except ImportError:
+ tifffile = None
+
+jpeg = None
+supported_backends = ['cv2', 'turbojpeg', 'pillow', 'tifffile']
+
+imread_flags = {
+ 'color': IMREAD_COLOR,
+ 'grayscale': IMREAD_GRAYSCALE,
+ 'unchanged': IMREAD_UNCHANGED,
+ 'color_ignore_orientation': IMREAD_IGNORE_ORIENTATION | IMREAD_COLOR,
+ 'grayscale_ignore_orientation':
+ IMREAD_IGNORE_ORIENTATION | IMREAD_GRAYSCALE
+}
+
+imread_backend = 'cv2'
+
+
+def use_backend(backend):
+ """Select a backend for image decoding.
+
+ Args:
+ backend (str): The image decoding backend type. Options are `cv2`,
+ `pillow`, `turbojpeg` (see https://github.com/lilohuang/PyTurboJPEG)
+ and `tifffile`. `turbojpeg` is faster but it only supports `.jpeg`
+ file format.
+ """
+ assert backend in supported_backends
+ global imread_backend
+ imread_backend = backend
+ if imread_backend == 'turbojpeg':
+ if TurboJPEG is None:
+ raise ImportError('`PyTurboJPEG` is not installed')
+ global jpeg
+ if jpeg is None:
+ jpeg = TurboJPEG()
+ elif imread_backend == 'pillow':
+ if Image is None:
+ raise ImportError('`Pillow` is not installed')
+ elif imread_backend == 'tifffile':
+ if tifffile is None:
+ raise ImportError('`tifffile` is not installed')
+
+
+def _jpegflag(flag='color', channel_order='bgr'):
+ channel_order = channel_order.lower()
+ if channel_order not in ['rgb', 'bgr']:
+ raise ValueError('channel order must be either "rgb" or "bgr"')
+
+ if flag == 'color':
+ if channel_order == 'bgr':
+ return TJPF_BGR
+ elif channel_order == 'rgb':
+ return TJCS_RGB
+ elif flag == 'grayscale':
+ return TJPF_GRAY
+ else:
+ raise ValueError('flag must be "color" or "grayscale"')
+
+
+def _pillow2array(img, flag='color', channel_order='bgr'):
+ """Convert a pillow image to numpy array.
+
+ Args:
+ img (:obj:`PIL.Image.Image`): The image loaded using PIL
+ flag (str): Flags specifying the color type of a loaded image,
+ candidates are 'color', 'grayscale' and 'unchanged'.
+ Default to 'color'.
+ channel_order (str): The channel order of the output image array,
+ candidates are 'bgr' and 'rgb'. Default to 'bgr'.
+
+ Returns:
+ np.ndarray: The converted numpy array
+ """
+ channel_order = channel_order.lower()
+ if channel_order not in ['rgb', 'bgr']:
+ raise ValueError('channel order must be either "rgb" or "bgr"')
+
+ if flag == 'unchanged':
+ array = np.array(img)
+ if array.ndim >= 3 and array.shape[2] >= 3: # color image
+ array[:, :, :3] = array[:, :, (2, 1, 0)] # RGB to BGR
+ else:
+ # Handle exif orientation tag
+ if flag in ['color', 'grayscale']:
+ img = ImageOps.exif_transpose(img)
+ # If the image mode is not 'RGB', convert it to 'RGB' first.
+ if img.mode != 'RGB':
+ if img.mode != 'LA':
+ # Most formats except 'LA' can be directly converted to RGB
+ img = img.convert('RGB')
+ else:
+ # When the mode is 'LA', the default conversion will fill in
+ # the canvas with black, which sometimes shadows black objects
+ # in the foreground.
+ #
+ # Therefore, a random color (124, 117, 104) is used for canvas
+ img_rgba = img.convert('RGBA')
+ img = Image.new('RGB', img_rgba.size, (124, 117, 104))
+ img.paste(img_rgba, mask=img_rgba.split()[3]) # 3 is alpha
+ if flag in ['color', 'color_ignore_orientation']:
+ array = np.array(img)
+ if channel_order != 'rgb':
+ array = array[:, :, ::-1] # RGB to BGR
+ elif flag in ['grayscale', 'grayscale_ignore_orientation']:
+ img = img.convert('L')
+ array = np.array(img)
+ else:
+ raise ValueError(
+ 'flag must be "color", "grayscale", "unchanged", '
+ f'"color_ignore_orientation" or "grayscale_ignore_orientation"'
+ f' but got {flag}')
+ return array
+
+
+def imread(img_or_path,
+ flag='color',
+ channel_order='bgr',
+ backend=None,
+ file_client_args=None):
+ """Read an image.
+
+ Note:
+ In v1.4.1 and later, the `file_client_args` parameter is added.
+
+ Args:
+ img_or_path (ndarray or str or Path): Either a numpy array or str or
+ pathlib.Path. If it is a numpy array (loaded image), then
+ it will be returned as is.
+ flag (str): Flags specifying the color type of a loaded image,
+ candidates are `color`, `grayscale`, `unchanged`,
+ `color_ignore_orientation` and `grayscale_ignore_orientation`.
+ By default, `cv2` and `pillow` backend would rotate the image
+ according to its EXIF info unless called with `unchanged` or
+ `*_ignore_orientation` flags. `turbojpeg` and `tifffile` backend
+ always ignore image's EXIF info regardless of the flag.
+ The `turbojpeg` backend only supports `color` and `grayscale`.
+ channel_order (str): Order of channel, candidates are `bgr` and `rgb`.
+ backend (str | None): The image decoding backend type. Options are
+ `cv2`, `pillow`, `turbojpeg`, `tifffile`, `None`.
+ If backend is None, the global imread_backend specified by
+ ``mmcv.use_backend()`` will be used. Default: None.
+ file_client_args (dict | None): Arguments to instantiate a
+ FileClient. See :class:`mmcv.fileio.FileClient` for details.
+ Default: None.
+
+ Returns:
+ ndarray: Loaded image array.
+
+ Examples:
+ >>> import mmcv
+ >>> img_path = '/path/to/img.jpg'
+ >>> img = mmcv.imread(img_path)
+ >>> img = mmcv.imread(img_path, flag='color', channel_order='rgb',
+ ... backend='cv2')
+ >>> img = mmcv.imread(img_path, flag='color', channel_order='bgr',
+ ... backend='pillow')
+ >>> s3_img_path = 's3://bucket/img.jpg'
+ >>> # infer the file backend by the prefix s3
+ >>> img = mmcv.imread(s3_img_path)
+ >>> # manually set the file backend petrel
+ >>> img = mmcv.imread(s3_img_path, file_client_args={
+ ... 'backend': 'petrel'})
+ >>> http_img_path = 'http://path/to/img.jpg'
+ >>> img = mmcv.imread(http_img_path)
+ >>> img = mmcv.imread(http_img_path, file_client_args={
+ ... 'backend': 'http'})
+ """
+
+ if isinstance(img_or_path, Path):
+ img_or_path = str(img_or_path)
+
+ if isinstance(img_or_path, np.ndarray):
+ return img_or_path
+ elif is_str(img_or_path):
+ file_client = FileClient.infer_client(file_client_args, img_or_path)
+ img_bytes = file_client.get(img_or_path)
+ return imfrombytes(img_bytes, flag, channel_order, backend)
+ else:
+ raise TypeError('"img" must be a numpy array or a str or '
+ 'a pathlib.Path object')
+
+
+def imfrombytes(content, flag='color', channel_order='bgr', backend=None):
+ """Read an image from bytes.
+
+ Args:
+ content (bytes): Image bytes got from files or other streams.
+ flag (str): Same as :func:`imread`.
+ channel_order (str): The channel order of the output image array,
+ candidates are 'bgr' and 'rgb'. Default to 'bgr'.
+ backend (str | None): The image decoding backend type. Options are
+ `cv2`, `pillow`, `turbojpeg`, `tifffile`, `None`. If backend is
+ None, the global imread_backend specified by ``mmcv.use_backend()``
+ will be used. Default: None.
+
+ Returns:
+ ndarray: Loaded image array.
+
+ Examples:
+ >>> img_path = '/path/to/img.jpg'
+ >>> with open(img_path, 'rb') as f:
+ >>> img_buff = f.read()
+ >>> img = mmcv.imfrombytes(img_buff)
+ >>> img = mmcv.imfrombytes(img_buff, flag='color', channel_order='rgb')
+ >>> img = mmcv.imfrombytes(img_buff, backend='pillow')
+ >>> img = mmcv.imfrombytes(img_buff, backend='cv2')
+ """
+
+ if backend is None:
+ backend = imread_backend
+ if backend not in supported_backends:
+ raise ValueError(
+ f'backend: {backend} is not supported. Supported '
+ "backends are 'cv2', 'turbojpeg', 'pillow', 'tifffile'")
+ if backend == 'turbojpeg':
+ img = jpeg.decode(content, _jpegflag(flag, channel_order))
+ if img.shape[-1] == 1:
+ img = img[:, :, 0]
+ return img
+ elif backend == 'pillow':
+ with io.BytesIO(content) as buff:
+ img = Image.open(buff)
+ img = _pillow2array(img, flag, channel_order)
+ return img
+ elif backend == 'tifffile':
+ with io.BytesIO(content) as buff:
+ img = tifffile.imread(buff)
+ return img
+ else:
+ img_np = np.frombuffer(content, np.uint8)
+ flag = imread_flags[flag] if is_str(flag) else flag
+ img = cv2.imdecode(img_np, flag)
+ if flag == IMREAD_COLOR and channel_order == 'rgb':
+ cv2.cvtColor(img, cv2.COLOR_BGR2RGB, img)
+ return img
+
+
+def imwrite(img,
+ file_path,
+ params=None,
+ auto_mkdir=None,
+ file_client_args=None):
+ """Write image to file.
+
+ Note:
+ In v1.4.1 and later, the `file_client_args` parameter is added.
+
+ Warning:
+ The parameter `auto_mkdir` will be deprecated in the future and every
+ file client will make directories automatically.
+
+ Args:
+ img (ndarray): Image array to be written.
+ file_path (str): Image file path.
+ params (None or list): Same as opencv :func:`imwrite` interface.
+ auto_mkdir (bool): If the parent folder of `file_path` does not exist,
+ whether to create it automatically. It will be deprecated.
+ file_client_args (dict | None): Arguments to instantiate a
+ FileClient. See :class:`mmcv.fileio.FileClient` for details.
+ Default: None.
+
+ Returns:
+ bool: Successful or not.
+
+ Examples:
+ >>> # write to hard disk client
+ >>> ret = mmcv.imwrite(img, '/path/to/img.jpg')
+ >>> # infer the file backend by the prefix s3
+ >>> ret = mmcv.imwrite(img, 's3://bucket/img.jpg')
+ >>> # manually set the file backend petrel
+ >>> ret = mmcv.imwrite(img, 's3://bucket/img.jpg', file_client_args={
+ ... 'backend': 'petrel'})
+ """
+ assert is_filepath(file_path)
+ file_path = str(file_path)
+ if auto_mkdir is not None:
+ warnings.warn(
+ 'The parameter `auto_mkdir` will be deprecated in the future and '
+ 'every file client will create directories automatically.')
+ file_client = FileClient.infer_client(file_client_args, file_path)
+ img_ext = osp.splitext(file_path)[-1]
+ # Encode image according to image suffix.
+ # For example, if image path is '/path/your/img.jpg', the encode
+ # format is '.jpg'.
+ flag, img_buff = cv2.imencode(img_ext, img, params)
+ file_client.put(img_buff.tobytes(), file_path)
+ return flag
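
A minimal usage sketch of the read/write helpers above, assuming this module is importable as `mmcv` with OpenCV available as the decoding backend; the file paths are placeholders:

import mmcv

# Decode raw bytes with the cv2 backend, requesting RGB channel order.
with open('/path/to/img.jpg', 'rb') as f:
    buff = f.read()
img = mmcv.imfrombytes(buff, flag='color', channel_order='rgb', backend='cv2')

# Write the array back to disk. The encode format is inferred from the file
# suffix ('.png' selects PNG encoding), and imwrite expects BGR order, so the
# channels are flipped back before writing.
ok = mmcv.imwrite(img[..., ::-1], '/path/to/out.png')
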
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/image/misc.py b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/image/misc.py
new file mode 100644
index 0000000000000000000000000000000000000000..3f33962cd321a65abc1c48c33d5b0a2fdd9ad657
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/image/misc.py
@@ -0,0 +1,66 @@
+# encoding=utf-8
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import numpy as np
+
+import mmcv
+
+try:
+ import torch
+except ImportError:
+ torch = None
+
+
+def tensor2imgs(tensor, mean=None, std=None, to_rgb=True):
+ """Convert tensor to 3-channel images or 1-channel gray images.
+
+ Args:
+ tensor (torch.Tensor): Tensor that contains multiple images, shape (
+ N, C, H, W). :math:`C` can be either 3 or 1.
+ mean (tuple[float], optional): Mean of images. If None,
+ (0, 0, 0) will be used for a 3-channel tensor and (0, ) for a
+ 1-channel tensor. Defaults to None.
+ std (tuple[float], optional): Standard deviation of images. If None,
+ (1, 1, 1) will be used for a 3-channel tensor and (1, ) for a
+ 1-channel tensor. Defaults to None.
+ to_rgb (bool, optional): Whether the tensor was converted to RGB
+ format in the first place. If so, convert it back to BGR.
+ For a 1-channel tensor, it must be False. Defaults to True.
+
+ Returns:
+ list[np.ndarray]: A list that contains multiple images.
+ """
+
+ if torch is None:
+ raise RuntimeError('pytorch is not installed')
+ assert torch.is_tensor(tensor) and tensor.ndim == 4
+ channels = tensor.size(1)
+ assert channels in [1, 3]
+ if mean is None:
+ mean = (0, ) * channels
+ if std is None:
+ std = (1, ) * channels
+ assert (channels == len(mean) == len(std) == 3) or \
+ (channels == len(mean) == len(std) == 1 and not to_rgb)
+
+ num_imgs = tensor.size(0)
+ mean = np.array(mean, dtype=np.float32)
+ std = np.array(std, dtype=np.float32)
+ imgs = []
+ for img_id in range(num_imgs):
+ img = tensor[img_id, ...].cpu().numpy().transpose(1, 2, 0)
+ img = mmcv.imdenormalize(
+ img, mean, std, to_bgr=to_rgb).astype(np.uint8)
+ imgs.append(np.ascontiguousarray(img))
+ return imgs
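
A minimal sketch of `tensor2imgs`, assuming the function is importable (the import path below is an assumption) and that `mmcv.imdenormalize` is installed; the mean/std values are the usual ImageNet statistics and purely illustrative:

import numpy as np
import torch
from mmcv_replace.image.misc import tensor2imgs  # import path is an assumption

# A batch of two normalized images, shape (N, C, H, W) with C == 3.
batch = torch.randn(2, 3, 32, 32)
imgs = tensor2imgs(batch,
                   mean=(123.675, 116.28, 103.53),
                   std=(58.395, 57.12, 57.375),
                   to_rgb=True)

# Each element is an (H, W, C) uint8 array in BGR order.
assert len(imgs) == 2
assert imgs[0].shape == (32, 32, 3) and imgs[0].dtype == np.uint8
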
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/image/photometric.py b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/image/photometric.py
new file mode 100644
index 0000000000000000000000000000000000000000..c58eb8a21f6b1e6a16b2176016d4414400ed6076
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/image/photometric.py
@@ -0,0 +1,441 @@
+# encoding=utf-8
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import cv2
+import numpy as np
+
+from ..utils import is_tuple_of
+from .colorspace import bgr2gray, gray2bgr
+
+
+def imnormalize(img, mean, std, to_rgb=True):
+ """Normalize an image with mean and std.
+
+ Args:
+ img (ndarray): Image to be normalized.
+ mean (ndarray): The mean to be used for normalize.
+ std (ndarray): The std to be used for normalize.
+ to_rgb (bool): Whether to convert to rgb.
+
+ Returns:
+ ndarray: The normalized image.
+ """
+ img = img.copy().astype(np.float32)
+ return imnormalize_(img, mean, std, to_rgb)
+
+
+def imnormalize_(img, mean, std, to_rgb=True):
+ """Inplace normalize an image with mean and std.
+
+ Args:
+ img (ndarray): Image to be normalized.
+ mean (ndarray): The mean to be used for normalize.
+ std (ndarray): The std to be used for normalize.
+ to_rgb (bool): Whether to convert to rgb.
+
+ Returns:
+ ndarray: The normalized image.
+ """
+ # cv2 inplace normalization does not accept uint8
+ assert img.dtype != np.uint8
+ mean = np.float64(mean.reshape(1, -1))
+ stdinv = 1 / np.float64(std.reshape(1, -1))
+ if to_rgb:
+ cv2.cvtColor(img, cv2.COLOR_BGR2RGB, img) # inplace
+ cv2.subtract(img, mean, img) # inplace
+ cv2.multiply(img, stdinv, img) # inplace
+ return img
+
+
+def imdenormalize(img, mean, std, to_bgr=True):
+ assert img.dtype != np.uint8
+ mean = mean.reshape(1, -1).astype(np.float64)
+ std = std.reshape(1, -1).astype(np.float64)
+ img = cv2.multiply(img, std) # make a copy
+ cv2.add(img, mean, img) # inplace
+ if to_bgr:
+ cv2.cvtColor(img, cv2.COLOR_RGB2BGR, img) # inplace
+ return img
+
+
+def iminvert(img):
+ """Invert (negate) an image.
+
+ Args:
+ img (ndarray): Image to be inverted.
+
+ Returns:
+ ndarray: The inverted image.
+ """
+ return np.full_like(img, 255) - img
+
+
+def solarize(img, thr=128):
+ """Solarize an image (invert all pixel values above a threshold)
+
+ Args:
+ img (ndarray): Image to be solarized.
+ thr (int): Threshold for solarizing (0 - 255).
+
+ Returns:
+ ndarray: The solarized image.
+ """
+ img = np.where(img < thr, img, 255 - img)
+ return img
+
+
+def posterize(img, bits):
+ """Posterize an image (reduce the number of bits for each color channel)
+
+ Args:
+ img (ndarray): Image to be posterized.
+ bits (int): Number of bits (1 to 8) to use for posterizing.
+
+ Returns:
+ ndarray: The posterized image.
+ """
+ shift = 8 - bits
+ img = np.left_shift(np.right_shift(img, shift), shift)
+ return img
+
+
+def adjust_color(img, alpha=1, beta=None, gamma=0):
+ r"""It blends the source image and its gray image:
+
+ .. math::
+ output = img * alpha + gray\_img * beta + gamma
+
+ Args:
+ img (ndarray): The input source image.
+ alpha (int | float): Weight for the source image. Default 1.
+ beta (int | float): Weight for the converted gray image.
+ If None, it's assigned the value (1 - `alpha`).
+ gamma (int | float): Scalar added to each sum.
+ Same as :func:`cv2.addWeighted`. Default 0.
+
+ Returns:
+ ndarray: Colored image which has the same size and dtype as input.
+ """
+ gray_img = bgr2gray(img)
+ gray_img = np.tile(gray_img[..., None], [1, 1, 3])
+ if beta is None:
+ beta = 1 - alpha
+ colored_img = cv2.addWeighted(img, alpha, gray_img, beta, gamma)
+ if not colored_img.dtype == np.uint8:
+ # Note when the dtype of `img` is not the default `np.uint8`
+ # (e.g. np.float32), the value in `colored_img` got from cv2
+ # is not guaranteed to be in range [0, 255], so here clip
+ # is needed.
+ colored_img = np.clip(colored_img, 0, 255)
+ return colored_img
+
+
+def imequalize(img):
+ """Equalize the image histogram.
+
+ This function applies a non-linear mapping to the input image,
+ in order to create a uniform distribution of grayscale values
+ in the output image.
+
+ Args:
+ img (ndarray): Image to be equalized.
+
+ Returns:
+ ndarray: The equalized image.
+ """
+
+ def _scale_channel(im, c):
+ """Scale the data in the corresponding channel."""
+ im = im[:, :, c]
+ # Compute the histogram of the image channel.
+ histo = np.histogram(im, 256, (0, 255))[0]
+ # For computing the step, filter out the nonzeros.
+ nonzero_histo = histo[histo > 0]
+ step = (np.sum(nonzero_histo) - nonzero_histo[-1]) // 255
+ if not step:
+ lut = np.array(range(256))
+ else:
+ # Compute the cumulative sum, shifted by step // 2
+ # and then normalized by step.
+ lut = (np.cumsum(histo) + (step // 2)) // step
+ # Shift lut, prepending with 0.
+ lut = np.concatenate([[0], lut[:-1]], 0)
+ # handle potential integer overflow
+ lut[lut > 255] = 255
+ # If step is zero, return the original image.
+ # Otherwise, index from lut.
+ return np.where(np.equal(step, 0), im, lut[im])
+
+ # Scales each channel independently and then stacks
+ # the result.
+ s1 = _scale_channel(img, 0)
+ s2 = _scale_channel(img, 1)
+ s3 = _scale_channel(img, 2)
+ equalized_img = np.stack([s1, s2, s3], axis=-1)
+ return equalized_img.astype(img.dtype)
+
+
+def adjust_brightness(img, factor=1.):
+ """Adjust image brightness.
+
+ This function controls the brightness of an image. An
+ enhancement factor of 0.0 gives a black image.
+ A factor of 1.0 gives the original image. This function
+ blends the source image and the degenerated black image:
+
+ .. math::
+ output = img * factor + degenerated * (1 - factor)
+
+ Args:
+ img (ndarray): Image to be brightened.
+ factor (float): A value that controls the enhancement.
+ Factor 1.0 returns the original image; lower
+ factors mean less color (brightness, contrast,
+ etc.), and higher values mean more. Default 1.
+
+ Returns:
+ ndarray: The brightened image.
+ """
+ degenerated = np.zeros_like(img)
+ # Note manually convert the dtype to np.float32 to achieve results
+ # as close as possible to PIL.ImageEnhance.Brightness.
+ # Set beta=1-factor, and gamma=0
+ brightened_img = cv2.addWeighted(
+ img.astype(np.float32), factor, degenerated.astype(np.float32),
+ 1 - factor, 0)
+ brightened_img = np.clip(brightened_img, 0, 255)
+ return brightened_img.astype(img.dtype)
+
+
+def adjust_contrast(img, factor=1.):
+ """Adjust image contrast.
+
+ This function controls the contrast of an image. An
+ enhancement factor of 0.0 gives a solid grey
+ image. A factor of 1.0 gives the original image. It
+ blends the source image and the degenerated mean image:
+
+ .. math::
+ output = img * factor + degenerated * (1 - factor)
+
+ Args:
+ img (ndarray): Image to be contrasted. BGR order.
+ factor (float): Same as :func:`mmcv.adjust_brightness`.
+
+ Returns:
+ ndarray: The contrasted image.
+ """
+ gray_img = bgr2gray(img)
+ hist = np.histogram(gray_img, 256, (0, 255))[0]
+ mean = round(np.sum(gray_img) / np.sum(hist))
+ degenerated = (np.ones_like(img[..., 0]) * mean).astype(img.dtype)
+ degenerated = gray2bgr(degenerated)
+ contrasted_img = cv2.addWeighted(
+ img.astype(np.float32), factor, degenerated.astype(np.float32),
+ 1 - factor, 0)
+ contrasted_img = np.clip(contrasted_img, 0, 255)
+ return contrasted_img.astype(img.dtype)
+
+
+def auto_contrast(img, cutoff=0):
+ """Auto adjust image contrast.
+
+ This function maximizes (normalizes) image contrast by first removing
+ the cutoff percent of the lightest and darkest pixels from the histogram
+ and then remapping the image so that the darkest remaining pixel becomes
+ black (0) and the lightest becomes white (255).
+
+ Args:
+ img (ndarray): Image to be contrasted. BGR order.
+ cutoff (int | float | tuple): The cutoff percent of the lightest and
+ darkest pixels to be removed. If given as tuple, it shall be
+ (low, high). Otherwise, the single value will be used for both.
+ Defaults to 0.
+
+ Returns:
+ ndarray: The contrasted image.
+ """
+
+ def _auto_contrast_channel(im, c, cutoff):
+ im = im[:, :, c]
+ # Compute the histogram of the image channel.
+ histo = np.histogram(im, 256, (0, 255))[0]
+ # Remove cut-off percent pixels from histo
+ histo_sum = np.cumsum(histo)
+ cut_low = histo_sum[-1] * cutoff[0] // 100
+ cut_high = histo_sum[-1] - histo_sum[-1] * cutoff[1] // 100
+ histo_sum = np.clip(histo_sum, cut_low, cut_high) - cut_low
+ histo = np.concatenate([[histo_sum[0]], np.diff(histo_sum)], 0)
+
+ # Compute mapping
+ low, high = np.nonzero(histo)[0][0], np.nonzero(histo)[0][-1]
+ # If all the values have been cut off, return the original image
+ if low >= high:
+ return im
+ scale = 255.0 / (high - low)
+ offset = -low * scale
+ lut = np.array(range(256))
+ lut = lut * scale + offset
+ lut = np.clip(lut, 0, 255)
+ return lut[im]
+
+ if isinstance(cutoff, (int, float)):
+ cutoff = (cutoff, cutoff)
+ else:
+ assert isinstance(cutoff, tuple), 'cutoff must be of type int, ' \
+ f'float or tuple, but got {type(cutoff)} instead.'
+ # Auto adjusts contrast for each channel independently and then stacks
+ # the result.
+ s1 = _auto_contrast_channel(img, 0, cutoff)
+ s2 = _auto_contrast_channel(img, 1, cutoff)
+ s3 = _auto_contrast_channel(img, 2, cutoff)
+ contrasted_img = np.stack([s1, s2, s3], axis=-1)
+ return contrasted_img.astype(img.dtype)
+
+
+def adjust_sharpness(img, factor=1., kernel=None):
+ """Adjust image sharpness.
+
+ This function controls the sharpness of an image. An
+ enhancement factor of 0.0 gives a blurred image. A
+ factor of 1.0 gives the original image. And a factor
+ of 2.0 gives a sharpened image. It blends the source
+ image and the degenerated mean image:
+
+ .. math::
+ output = img * factor + degenerated * (1 - factor)
+
+ Args:
+ img (ndarray): Image to be sharpened. BGR order.
+ factor (float): Same as :func:`mmcv.adjust_brightness`.
+ kernel (np.ndarray, optional): Filter kernel to be applied on the img
+ to obtain the degenerated img. Defaults to None.
+
+ Note:
+ No value sanity check is enforced on the kernel set by users, so with
+ an inappropriate kernel, ``adjust_sharpness`` may fail to perform the
+ function its name indicates and instead apply whatever transform the
+ kernel determines.
+
+ Returns:
+ ndarray: The sharpened image.
+ """
+
+ if kernel is None:
+ # adopted from PIL.ImageFilter.SMOOTH
+ kernel = np.array([[1., 1., 1.], [1., 5., 1.], [1., 1., 1.]]) / 13
+ assert isinstance(kernel, np.ndarray), \
+ f'kernel must be of type np.ndarray, but got {type(kernel)} instead.'
+ assert kernel.ndim == 2, \
+ f'kernel must have a dimension of 2, but got {kernel.ndim} instead.'
+
+ degenerated = cv2.filter2D(img, -1, kernel)
+ sharpened_img = cv2.addWeighted(
+ img.astype(np.float32), factor, degenerated.astype(np.float32),
+ 1 - factor, 0)
+ sharpened_img = np.clip(sharpened_img, 0, 255)
+ return sharpened_img.astype(img.dtype)
+
+
+def adjust_lighting(img, eigval, eigvec, alphastd=0.1, to_rgb=True):
+ """AlexNet-style PCA jitter.
+
+ This data augmentation is proposed in `ImageNet Classification with Deep
+ Convolutional Neural Networks
+ `_.
+
+ Args:
+ img (ndarray): Image whose lighting is to be adjusted. BGR order.
+ eigval (ndarray): The eigenvalues of the covariance matrix of pixel
+ values.
+ eigvec (ndarray): The eigenvectors of the covariance matrix of pixel
+ values.
+ alphastd (float): The standard deviation for the distribution of alpha.
+ Defaults to 0.1.
+ to_rgb (bool): Whether to convert img to rgb.
+
+ Returns:
+ ndarray: The adjusted image.
+ """
+ assert isinstance(eigval, np.ndarray) and isinstance(eigvec, np.ndarray), \
+ f'eigval and eigvec should both be of type np.ndarray, got ' \
+ f'{type(eigval)} and {type(eigvec)} instead.'
+
+ assert eigval.ndim == 1 and eigvec.ndim == 2
+ assert eigvec.shape == (3, eigval.shape[0])
+ n_eigval = eigval.shape[0]
+ assert isinstance(alphastd, float), 'alphastd should be of type float, ' \
+ f'got {type(alphastd)} instead.'
+
+ img = img.copy().astype(np.float32)
+ if to_rgb:
+ cv2.cvtColor(img, cv2.COLOR_BGR2RGB, img) # inplace
+
+ alpha = np.random.normal(0, alphastd, n_eigval)
+ alter = eigvec \
+ * np.broadcast_to(alpha.reshape(1, n_eigval), (3, n_eigval)) \
+ * np.broadcast_to(eigval.reshape(1, n_eigval), (3, n_eigval))
+ alter = np.broadcast_to(alter.sum(axis=1).reshape(1, 1, 3), img.shape)
+ img_adjusted = img + alter
+ return img_adjusted
+
+
+def lut_transform(img, lut_table):
+ """Transform array by look-up table.
+
+ The function lut_transform fills the output array with values from the
+ look-up table. Indices of the entries are taken from the input array.
+
+ Args:
+ img (ndarray): Image to be transformed.
+ lut_table (ndarray): look-up table of 256 elements; in case of
+ multi-channel input array, the table should either have a single
+ channel (in this case the same table is used for all channels) or
+ the same number of channels as in the input array.
+
+ Returns:
+ ndarray: The transformed image.
+ """
+ assert isinstance(img, np.ndarray)
+ assert 0 <= np.min(img) and np.max(img) <= 255
+ assert isinstance(lut_table, np.ndarray)
+ assert lut_table.shape == (256, )
+
+ return cv2.LUT(np.array(img, dtype=np.uint8), lut_table)
+
+
+def clahe(img, clip_limit=40.0, tile_grid_size=(8, 8)):
+ """Use CLAHE method to process the image.
+
+ See `ZUIDERVELD,K. Contrast Limited Adaptive Histogram Equalization[J].
+ Graphics Gems, 1994:474-485.` for more information.
+
+ Args:
+ img (ndarray): Image to be processed.
+ clip_limit (float): Threshold for contrast limiting. Default: 40.0.
+ tile_grid_size (tuple[int]): Size of grid for histogram equalization.
+ Input image will be divided into equally sized rectangular tiles.
+ It defines the number of tiles in row and column. Default: (8, 8).
+
+ Returns:
+ ndarray: The processed image.
+ """
+ assert isinstance(img, np.ndarray)
+ assert img.ndim == 2
+ assert isinstance(clip_limit, (float, int))
+ assert is_tuple_of(tile_grid_size, int)
+ assert len(tile_grid_size) == 2
+
+ clahe = cv2.createCLAHE(clip_limit, tile_grid_size)
+ return clahe.apply(np.array(img, dtype=np.uint8))
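
A minimal sketch of the normalize/denormalize pair defined above, assuming both functions are importable from this module (the import path is an assumption); the mean/std values are illustrative:

import numpy as np
from mmcv_replace.image.photometric import imnormalize, imdenormalize  # path is an assumption

img = np.random.randint(0, 256, (4, 4, 3), dtype=np.uint8)  # BGR test image
mean = np.array([123.675, 116.28, 103.53], dtype=np.float32)
std = np.array([58.395, 57.12, 57.375], dtype=np.float32)

# imnormalize converts BGR->RGB, subtracts the mean and divides by the std;
# imdenormalize reverses those steps, so the round trip restores the image.
norm = imnormalize(img, mean, std, to_rgb=True)
restored = imdenormalize(norm, mean, std, to_bgr=True)
assert np.allclose(restored, img.astype(np.float32), atol=1e-3)
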
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/model_zoo/deprecated.json b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/model_zoo/deprecated.json
new file mode 100644
index 0000000000000000000000000000000000000000..25cf6f28caecc22a77e3136fefa6b8dfc0e6cb5b
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/model_zoo/deprecated.json
@@ -0,0 +1,6 @@
+{
+ "resnet50_caffe": "detectron/resnet50_caffe",
+ "resnet50_caffe_bgr": "detectron2/resnet50_caffe_bgr",
+ "resnet101_caffe": "detectron/resnet101_caffe",
+ "resnet101_caffe_bgr": "detectron2/resnet101_caffe_bgr"
+}
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/model_zoo/mmcls.json b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/model_zoo/mmcls.json
new file mode 100644
index 0000000000000000000000000000000000000000..c073a41d0aeb44ee0243f97ecc3558de538f9300
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/model_zoo/mmcls.json
@@ -0,0 +1,59 @@
+{
+ "vgg11": "https://download.openmmlab.com/mmclassification/v0/vgg/vgg11_batch256_imagenet_20210208-4271cd6c.pth",
+ "vgg13": "https://download.openmmlab.com/mmclassification/v0/vgg/vgg13_batch256_imagenet_20210208-4d1d6080.pth",
+ "vgg16": "https://download.openmmlab.com/mmclassification/v0/vgg/vgg16_batch256_imagenet_20210208-db26f1a5.pth",
+ "vgg19": "https://download.openmmlab.com/mmclassification/v0/vgg/vgg19_batch256_imagenet_20210208-e6920e4a.pth",
+ "vgg11_bn": "https://download.openmmlab.com/mmclassification/v0/vgg/vgg11_bn_batch256_imagenet_20210207-f244902c.pth",
+ "vgg13_bn": "https://download.openmmlab.com/mmclassification/v0/vgg/vgg13_bn_batch256_imagenet_20210207-1a8b7864.pth",
+ "vgg16_bn": "https://download.openmmlab.com/mmclassification/v0/vgg/vgg16_bn_batch256_imagenet_20210208-7e55cd29.pth",
+ "vgg19_bn": "https://download.openmmlab.com/mmclassification/v0/vgg/vgg19_bn_batch256_imagenet_20210208-da620c4f.pth",
+ "resnet18": "https://download.openmmlab.com/mmclassification/v0/resnet/resnet18_8xb32_in1k_20210831-fbbb1da6.pth",
+ "resnet34": "https://download.openmmlab.com/mmclassification/v0/resnet/resnet34_8xb32_in1k_20210831-f257d4e6.pth",
+ "resnet50": "https://download.openmmlab.com/mmclassification/v0/resnet/resnet50_8xb32_in1k_20210831-ea4938fc.pth",
+ "resnet101": "https://download.openmmlab.com/mmclassification/v0/resnet/resnet101_8xb32_in1k_20210831-539c63f8.pth",
+ "resnet152": "https://download.openmmlab.com/mmclassification/v0/resnet/resnet152_8xb32_in1k_20210901-4d7582fa.pth",
+ "resnet50_v1d": "https://download.openmmlab.com/mmclassification/v0/resnet/resnetv1d50_b32x8_imagenet_20210531-db14775a.pth",
+ "resnet101_v1d": "https://download.openmmlab.com/mmclassification/v0/resnet/resnetv1d101_b32x8_imagenet_20210531-6e13bcd3.pth",
+ "resnet152_v1d": "https://download.openmmlab.com/mmclassification/v0/resnet/resnetv1d152_b32x8_imagenet_20210531-278cf22a.pth",
+ "resnext50_32x4d": "https://download.openmmlab.com/mmclassification/v0/resnext/resnext50_32x4d_b32x8_imagenet_20210429-56066e27.pth",
+ "resnext101_32x4d": "https://download.openmmlab.com/mmclassification/v0/resnext/resnext101_32x4d_b32x8_imagenet_20210506-e0fa3dd5.pth",
+ "resnext101_32x8d": "https://download.openmmlab.com/mmclassification/v0/resnext/resnext101_32x8d_b32x8_imagenet_20210506-23a247d5.pth",
+ "resnext152_32x4d": "https://download.openmmlab.com/mmclassification/v0/resnext/resnext152_32x4d_b32x8_imagenet_20210524-927787be.pth",
+ "se-resnet50": "https://download.openmmlab.com/mmclassification/v0/se-resnet/se-resnet50_batch256_imagenet_20200804-ae206104.pth",
+ "se-resnet101": "https://download.openmmlab.com/mmclassification/v0/se-resnet/se-resnet101_batch256_imagenet_20200804-ba5b51d4.pth",
+ "resnest50": "https://download.openmmlab.com/mmclassification/v0/resnest/resnest50_imagenet_converted-1ebf0afe.pth",
+ "resnest101": "https://download.openmmlab.com/mmclassification/v0/resnest/resnest101_imagenet_converted-032caa52.pth",
+ "resnest200": "https://download.openmmlab.com/mmclassification/v0/resnest/resnest200_imagenet_converted-581a60f2.pth",
+ "resnest269": "https://download.openmmlab.com/mmclassification/v0/resnest/resnest269_imagenet_converted-59930960.pth",
+ "shufflenet_v1": "https://download.openmmlab.com/mmclassification/v0/shufflenet_v1/shufflenet_v1_batch1024_imagenet_20200804-5d6cec73.pth",
+ "shufflenet_v2": "https://download.openmmlab.com/mmclassification/v0/shufflenet_v2/shufflenet_v2_batch1024_imagenet_20200812-5bf4721e.pth",
+ "mobilenet_v2": "https://download.openmmlab.com/mmclassification/v0/mobilenet_v2/mobilenet_v2_batch256_imagenet_20200708-3b2dc3af.pth",
+ "mobilenet_v3_small": "https://download.openmmlab.com/mmclassification/v0/mobilenet_v3/convert/mobilenet_v3_small-8427ecf0.pth",
+ "mobilenet_v3_large": "https://download.openmmlab.com/mmclassification/v0/mobilenet_v3/convert/mobilenet_v3_large-3ea3c186.pth",
+ "repvgg_A0": "https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-A0_3rdparty_4xb64-coslr-120e_in1k_20210909-883ab98c.pth",
+ "repvgg_A1": "https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-A1_3rdparty_4xb64-coslr-120e_in1k_20210909-24003a24.pth",
+ "repvgg_A2": "https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-A2_3rdparty_4xb64-coslr-120e_in1k_20210909-97d7695a.pth",
+ "repvgg_B0": "https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-B0_3rdparty_4xb64-coslr-120e_in1k_20210909-446375f4.pth",
+ "repvgg_B1": "https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-B1_3rdparty_4xb64-coslr-120e_in1k_20210909-750cdf67.pth",
+ "repvgg_B1g2": "https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-B1g2_3rdparty_4xb64-coslr-120e_in1k_20210909-344f6422.pth",
+ "repvgg_B1g4": "https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-B1g4_3rdparty_4xb64-coslr-120e_in1k_20210909-d4c1a642.pth",
+ "repvgg_B2": "https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-B2_3rdparty_4xb64-coslr-120e_in1k_20210909-bd6b937c.pth",
+ "repvgg_B2g4": "https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-B2g4_3rdparty_4xb64-autoaug-lbs-mixup-coslr-200e_in1k_20210909-7b7955f0.pth",
+ "repvgg_B3": "https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-B3_3rdparty_4xb64-autoaug-lbs-mixup-coslr-200e_in1k_20210909-dda968bf.pth",
+ "repvgg_B3g4": "https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-B3g4_3rdparty_4xb64-autoaug-lbs-mixup-coslr-200e_in1k_20210909-4e54846a.pth",
+ "repvgg_D2se": "https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-D2se_3rdparty_4xb64-autoaug-lbs-mixup-coslr-200e_in1k_20210909-cf3139b7.pth",
+ "res2net101_w26": "https://download.openmmlab.com/mmclassification/v0/res2net/res2net101-w26-s4_3rdparty_8xb32_in1k_20210927-870b6c36.pth",
+ "res2net50_w14": "https://download.openmmlab.com/mmclassification/v0/res2net/res2net50-w14-s8_3rdparty_8xb32_in1k_20210927-bc967bf1.pth",
+ "res2net50_w26": "https://download.openmmlab.com/mmclassification/v0/res2net/res2net50-w26-s8_3rdparty_8xb32_in1k_20210927-f547a94b.pth",
+ "swin_tiny": "https://download.openmmlab.com/mmclassification/v0/swin-transformer/swin_tiny_224_b16x64_300e_imagenet_20210616_090925-66df6be6.pth",
+ "swin_small": "https://download.openmmlab.com/mmclassification/v0/swin-transformer/swin_small_224_b16x64_300e_imagenet_20210615_110219-7f9d988b.pth",
+ "swin_base": "https://download.openmmlab.com/mmclassification/v0/swin-transformer/convert/swin_base_patch4_window7_224_22kto1k-f967f799.pth",
+ "swin_large": "https://download.openmmlab.com/mmclassification/v0/swin-transformer/convert/swin_large_patch4_window7_224_22kto1k-5f0996db.pth",
+ "t2t_vit_t_14": "https://download.openmmlab.com/mmclassification/v0/t2t-vit/t2t-vit-t-14_3rdparty_8xb64_in1k_20210928-b7c09b62.pth",
+ "t2t_vit_t_19": "https://download.openmmlab.com/mmclassification/v0/t2t-vit/t2t-vit-t-19_3rdparty_8xb64_in1k_20210928-7f1478d5.pth",
+ "t2t_vit_t_24": "https://download.openmmlab.com/mmclassification/v0/t2t-vit/t2t-vit-t-24_3rdparty_8xb64_in1k_20210928-fe95a61b.pth",
+ "tnt_small": "https://download.openmmlab.com/mmclassification/v0/tnt/tnt-small-p16_3rdparty_in1k_20210903-c56ee7df.pth",
+ "vit_base_p16": "https://download.openmmlab.com/mmclassification/v0/vit/finetune/vit-base-p16_in21k-pre-3rdparty_ft-64xb64_in1k-384_20210928-98e8652b.pth",
+ "vit_base_p32": "https://download.openmmlab.com/mmclassification/v0/vit/finetune/vit-base-p32_in21k-pre-3rdparty_ft-64xb64_in1k-384_20210928-9cea8599.pth",
+ "vit_large_p16": "https://download.openmmlab.com/mmclassification/v0/vit/finetune/vit-large-p16_in21k-pre-3rdparty_ft-64xb64_in1k-384_20210928-b20ba619.pth"
+}
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/model_zoo/open_mmlab.json b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/model_zoo/open_mmlab.json
new file mode 100644
index 0000000000000000000000000000000000000000..8311db4feef92faa0841c697d75efbee8430c3a0
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/model_zoo/open_mmlab.json
@@ -0,0 +1,50 @@
+{
+ "vgg16_caffe": "https://download.openmmlab.com/pretrain/third_party/vgg16_caffe-292e1171.pth",
+ "detectron/resnet50_caffe": "https://download.openmmlab.com/pretrain/third_party/resnet50_caffe-788b5fa3.pth",
+ "detectron2/resnet50_caffe": "https://download.openmmlab.com/pretrain/third_party/resnet50_msra-5891d200.pth",
+ "detectron/resnet101_caffe": "https://download.openmmlab.com/pretrain/third_party/resnet101_caffe-3ad79236.pth",
+ "detectron2/resnet101_caffe": "https://download.openmmlab.com/pretrain/third_party/resnet101_msra-6cc46731.pth",
+ "detectron2/resnext101_32x8d": "https://download.openmmlab.com/pretrain/third_party/resnext101_32x8d-1516f1aa.pth",
+ "resnext50_32x4d": "https://download.openmmlab.com/pretrain/third_party/resnext50-32x4d-0ab1a123.pth",
+ "resnext101_32x4d": "https://download.openmmlab.com/pretrain/third_party/resnext101_32x4d-a5af3160.pth",
+ "resnext101_64x4d": "https://download.openmmlab.com/pretrain/third_party/resnext101_64x4d-ee2c6f71.pth",
+ "contrib/resnet50_gn": "https://download.openmmlab.com/pretrain/third_party/resnet50_gn_thangvubk-ad1730dd.pth",
+ "detectron/resnet50_gn": "https://download.openmmlab.com/pretrain/third_party/resnet50_gn-9186a21c.pth",
+ "detectron/resnet101_gn": "https://download.openmmlab.com/pretrain/third_party/resnet101_gn-cac0ab98.pth",
+ "jhu/resnet50_gn_ws": "https://download.openmmlab.com/pretrain/third_party/resnet50_gn_ws-15beedd8.pth",
+ "jhu/resnet101_gn_ws": "https://download.openmmlab.com/pretrain/third_party/resnet101_gn_ws-3e3c308c.pth",
+ "jhu/resnext50_32x4d_gn_ws": "https://download.openmmlab.com/pretrain/third_party/resnext50_32x4d_gn_ws-0d87ac85.pth",
+ "jhu/resnext101_32x4d_gn_ws": "https://download.openmmlab.com/pretrain/third_party/resnext101_32x4d_gn_ws-34ac1a9e.pth",
+ "jhu/resnext50_32x4d_gn": "https://download.openmmlab.com/pretrain/third_party/resnext50_32x4d_gn-c7e8b754.pth",
+ "jhu/resnext101_32x4d_gn": "https://download.openmmlab.com/pretrain/third_party/resnext101_32x4d_gn-ac3bb84e.pth",
+ "msra/hrnetv2_w18_small": "https://download.openmmlab.com/pretrain/third_party/hrnetv2_w18_small-b5a04e21.pth",
+ "msra/hrnetv2_w18": "https://download.openmmlab.com/pretrain/third_party/hrnetv2_w18-00eb2006.pth",
+ "msra/hrnetv2_w32": "https://download.openmmlab.com/pretrain/third_party/hrnetv2_w32-dc9eeb4f.pth",
+ "msra/hrnetv2_w40": "https://download.openmmlab.com/pretrain/third_party/hrnetv2_w40-ed0b031c.pth",
+ "msra/hrnetv2_w48": "https://download.openmmlab.com/pretrain/third_party/hrnetv2_w48-d2186c55.pth",
+ "bninception_caffe": "https://download.openmmlab.com/pretrain/third_party/bn_inception_caffe-ed2e8665.pth",
+ "kin400/i3d_r50_f32s2_k400": "https://download.openmmlab.com/pretrain/third_party/i3d_r50_f32s2_k400-2c57e077.pth",
+ "kin400/nl3d_r50_f32s2_k400": "https://download.openmmlab.com/pretrain/third_party/nl3d_r50_f32s2_k400-fa7e7caa.pth",
+ "res2net101_v1d_26w_4s": "https://download.openmmlab.com/pretrain/third_party/res2net101_v1d_26w_4s_mmdetv2-f0a600f9.pth",
+ "regnetx_400mf": "https://download.openmmlab.com/pretrain/third_party/regnetx_400mf-a5b10d96.pth",
+ "regnetx_800mf": "https://download.openmmlab.com/pretrain/third_party/regnetx_800mf-1f4be4c7.pth",
+ "regnetx_1.6gf": "https://download.openmmlab.com/pretrain/third_party/regnetx_1.6gf-5791c176.pth",
+ "regnetx_3.2gf": "https://download.openmmlab.com/pretrain/third_party/regnetx_3.2gf-c2599b0f.pth",
+ "regnetx_4.0gf": "https://download.openmmlab.com/pretrain/third_party/regnetx_4.0gf-a88f671e.pth",
+ "regnetx_6.4gf": "https://download.openmmlab.com/pretrain/third_party/regnetx_6.4gf-006af45d.pth",
+ "regnetx_8.0gf": "https://download.openmmlab.com/pretrain/third_party/regnetx_8.0gf-3c68abe7.pth",
+ "regnetx_12gf": "https://download.openmmlab.com/pretrain/third_party/regnetx_12gf-4c2a3350.pth",
+ "resnet18_v1c": "https://download.openmmlab.com/pretrain/third_party/resnet18_v1c-b5776b93.pth",
+ "resnet50_v1c": "https://download.openmmlab.com/pretrain/third_party/resnet50_v1c-2cccc1ad.pth",
+ "resnet101_v1c": "https://download.openmmlab.com/pretrain/third_party/resnet101_v1c-e67eebb6.pth",
+ "mmedit/vgg16": "https://download.openmmlab.com/mmediting/third_party/vgg_state_dict.pth",
+ "mmedit/res34_en_nomixup": "https://download.openmmlab.com/mmediting/third_party/model_best_resnet34_En_nomixup.pth",
+ "mmedit/mobilenet_v2": "https://download.openmmlab.com/mmediting/third_party/mobilenet_v2.pth",
+ "contrib/mobilenet_v3_large": "https://download.openmmlab.com/pretrain/third_party/mobilenet_v3_large-bc2c3fd3.pth",
+ "contrib/mobilenet_v3_small": "https://download.openmmlab.com/pretrain/third_party/mobilenet_v3_small-47085aa1.pth",
+ "resnest50": "https://download.openmmlab.com/pretrain/third_party/resnest50_d2-7497a55b.pth",
+ "resnest101": "https://download.openmmlab.com/pretrain/third_party/resnest101_d2-f3b931b2.pth",
+ "resnest200": "https://download.openmmlab.com/pretrain/third_party/resnest200_d2-ca88e41f.pth",
+ "darknet53": "https://download.openmmlab.com/pretrain/third_party/darknet53-a628ea1b.pth",
+ "mmdet/mobilenet_v2": "https://download.openmmlab.com/mmdetection/v2.0/third_party/mobilenet_v2_batch256_imagenet-ff34753d.pth"
+}
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/onnx/__init__.py b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/onnx/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..360c27e38e27dbf71b006766e76d91e56789e264
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/onnx/__init__.py
@@ -0,0 +1,18 @@
+# encoding=utf-8
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .info import is_custom_op_loaded
+from .symbolic import register_extra_symbolics
+
+__all__ = ['register_extra_symbolics', 'is_custom_op_loaded']
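
A minimal sketch of how these exports are typically used before an ONNX export, assuming the package resolves under the usual `mmcv.onnx` name (the replacement files keep the same public names); the opset value and the exported model are illustrative:

import torch
from mmcv.onnx import register_extra_symbolics, is_custom_op_loaded

# Register the extra symbolics (e.g. the Resize-based interpolate ops) before
# calling torch.onnx.export.
register_extra_symbolics(11)

if not is_custom_op_loaded():
    print('Neither the TensorRT plugin nor the onnxruntime custom-op library '
          'was found; exporting plain ONNX ops only.')

model = torch.nn.Upsample(scale_factor=2, mode='bilinear', align_corners=False)
dummy = torch.randn(1, 3, 32, 32)
torch.onnx.export(model, dummy, 'upsample.onnx', opset_version=11)
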
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/onnx/info.py b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/onnx/info.py
new file mode 100644
index 0000000000000000000000000000000000000000..3aa367f3d94debcd28706c0c580ddc06f8cf6455
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/onnx/info.py
@@ -0,0 +1,34 @@
+# encoding=utf-8
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+
+import torch
+
+
+def is_custom_op_loaded():
+ flag = False
+ try:
+ from ..tensorrt import is_tensorrt_plugin_loaded
+ flag = is_tensorrt_plugin_loaded()
+ except (ImportError, ModuleNotFoundError):
+ pass
+ if not flag:
+ try:
+ from ..ops import get_onnxruntime_op_path
+ ort_lib_path = get_onnxruntime_op_path()
+ flag = os.path.exists(ort_lib_path)
+ except (ImportError, ModuleNotFoundError):
+ pass
+ return flag or torch.__version__ == 'parrots'
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/onnx/onnx_utils/__init__.py b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/onnx/onnx_utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..5880464d7daa3df842818358dbff34938a119ab3
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/onnx/onnx_utils/__init__.py
@@ -0,0 +1,14 @@
+# encoding=utf-8
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/onnx/onnx_utils/symbolic_helper.py b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/onnx/onnx_utils/symbolic_helper.py
new file mode 100644
index 0000000000000000000000000000000000000000..3a922060a4e5ec0a8451388b273272977abfabd4
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/onnx/onnx_utils/symbolic_helper.py
@@ -0,0 +1,344 @@
+# encoding=utf-8
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Modified from https://github.com/pytorch/pytorch."""
+import warnings
+from functools import wraps
+from sys import maxsize
+
+import torch
+import torch.onnx
+# This import monkey-patches graph manipulation methods on Graph, used for the
+# ONNX symbolics
+import torch.onnx.utils
+from torch._C import ListType
+
+# ---------------------------------------------------------------------------------
+# Helper functions
+# ---------------------------------------------------------------------------------
+
+# Save some builtins as locals, because we'll shadow them below
+_sum = sum
+
+
+def _parse_arg(value, desc):
+ if desc == 'none':
+ return value
+ if desc == 'v' or not _is_value(value):
+ return value
+ if value.node().mustBeNone():
+ return None
+ if value.node().kind() == 'onnx::Constant':
+ tval = value.node()['value']
+ if desc == 'i':
+ return int(tval)
+ elif desc == 'f':
+ return float(tval)
+ elif desc == 'b':
+ return bool(tval)
+ elif desc == 's':
+ return str(tval)
+ elif desc == 't':
+ return tval
+ elif desc == 'is':
+ return [int(v) for v in tval]
+ elif desc == 'fs':
+ return [float(v) for v in tval]
+ else:
+ raise RuntimeError(
+ "ONNX symbolic doesn't know to interpret Constant node")
+ elif value.node().kind() == 'prim::ListConstruct':
+ if desc == 'is':
+ for v in value.node().inputs():
+ if v.node().kind() != 'onnx::Constant':
+ raise RuntimeError(
+ "Failed to export an ONNX attribute '" +
+ v.node().kind() +
+ "', since it's not constant, please try to make "
+ 'things (e.g., kernel size) static if possible')
+ return [int(v.node()['value']) for v in value.node().inputs()]
+ else:
+ raise RuntimeError(
+ "ONNX symbolic doesn't know to interpret ListConstruct node")
+
+ raise RuntimeError('Unexpected node type: {}'.format(value.node().kind()))
+
+
+def _maybe_get_const(value, desc):
+ if _is_value(value) and value.node().kind() == 'onnx::Constant':
+ return _parse_arg(value, desc)
+ return value
+
+
+def _maybe_get_scalar(value):
+ value_t = _maybe_get_const(value, 't')
+ if isinstance(value_t, torch.Tensor) and value_t.shape == ():
+ return value_t
+ return value
+
+
+def _get_const(value, desc, arg_name):
+ if _is_value(value) and value.node().kind() not in ('onnx::Constant',
+ 'prim::Constant'):
+ raise RuntimeError('ONNX symbolic expected a constant'
+ ' value of the {} argument, got `{}`'.format(
+ arg_name, value))
+ return _parse_arg(value, desc)
+
+
+def _unpack_list(list_value):
+ list_node = list_value.node()
+ assert list_node.kind() == 'prim::ListConstruct'
+ return list(list_node.inputs())
+
+
+# Check if list_value is output from prim::ListConstruct
+# This is usually called before _unpack_list to ensure the list can be
+# unpacked.
+def _is_packed_list(list_value):
+ return _is_value(
+ list_value) and list_value.node().kind() == 'prim::ListConstruct'
+
+
+def parse_args(*arg_descriptors):
+
+ def decorator(fn):
+ fn._arg_descriptors = arg_descriptors
+
+ def wrapper(g, *args):
+ # some args may be optional, so the length may be smaller
+ assert len(arg_descriptors) >= len(args)
+ args = [
+ _parse_arg(arg, arg_desc)
+ for arg, arg_desc in zip(args, arg_descriptors)
+ ]
+ return fn(g, *args)
+
+ # In Python 2 functools.wraps chokes on partially applied functions, so
+ # we need this as a workaround
+ try:
+ wrapper = wraps(fn)(wrapper)
+ except Exception:
+ pass
+ return wrapper
+
+ return decorator
+
+
+def _scalar(x):
+ """Convert a scalar tensor into a Python value."""
+ assert x.numel() == 1
+ return x.item()
+
+
+def _if_scalar_type_as(g, self, tensor):
+ """Convert self into the same type of tensor, as necessary."""
+ if isinstance(self, torch._C.Value):
+ return self
+
+ scalar_type = tensor.type().scalarType()
+ if scalar_type:
+ ty = scalar_type.lower()
+ return getattr(self, ty)()
+
+ return self
+
+
+def _is_none(x):
+ return x.node().mustBeNone()
+
+
+def _is_value(x):
+ return isinstance(x, torch._C.Value)
+
+
+def _is_tensor_list(x):
+ return x.type().isSubtypeOf(ListType.ofTensors())
+
+
+def _unimplemented(op, msg):
+ warnings.warn('ONNX export failed on ' + op + ' because ' + msg +
+ ' not supported')
+
+
+def _try_get_scalar_type(*args):
+ for arg in args:
+ try:
+ return arg.type().scalarType()
+ except RuntimeError:
+ pass
+ return None
+
+
+def _topk_helper(g, input, k, dim, largest=True, sorted=False, out=None):
+ if out is not None:
+ _unimplemented('TopK', 'Out parameter is not supported')
+ if not _is_value(k):
+ k = g.op('Constant', value_t=torch.tensor([k], dtype=torch.int64))
+ else:
+ k = g.op('Reshape', k, g.op('Constant', value_t=torch.tensor([1])))
+ return g.op(
+ 'TopK',
+ input,
+ k,
+ axis_i=dim,
+ largest_i=largest,
+ sorted_i=sorted,
+ outputs=2)
+
+
+def _slice_helper(g,
+ input,
+ axes,
+ starts,
+ ends,
+ steps=None,
+ dynamic_slice=False):
+ # TODO(ruobing): add support for opset<10
+ from torch.onnx.symbolic_opset10 import _slice
+ return _slice(g, input, axes, starts, ends, steps, dynamic_slice)
+
+
+def _unsqueeze_helper(g, input, dim):
+ from torch.onnx.symbolic_opset9 import unsqueeze
+ return unsqueeze(g, input, dim)
+
+
+def _interpolate_size_to_scales(g, input, output_size, dim):
+ output_size = _maybe_get_const(output_size, 'is')
+ if _is_value(output_size):
+ offset = 2
+ offsets = g.op(
+ 'Constant', value_t=torch.ones(offset, dtype=torch.float32))
+ dividend = g.op(
+ 'Cast', output_size, to_i=cast_pytorch_to_onnx['Float'])
+ divisor = _slice_helper(
+ g, g.op('Shape', input), axes=[0], ends=[maxsize], starts=[offset])
+ divisor = g.op('Cast', divisor, to_i=cast_pytorch_to_onnx['Float'])
+ scale_dims = g.op('Div', dividend, divisor)
+ scales = g.op('Concat', offsets, scale_dims, axis_i=0)
+ else:
+ scales_constant = [
+ 1. if i < 2 else float(output_size[-(dim - i)]) /
+ float(input.type().sizes()[-(dim - i)]) for i in range(0, dim)
+ ]
+ scales = g.op(
+ 'Constant',
+ value_t=torch.tensor(scales_constant, dtype=torch.float32))
+ return scales
+
+
+def _interpolate_get_scales_if_available(g, scales):
+ if len(scales) == 0:
+ return None
+ # scales[0] is NoneType in Pytorch == 1.5.1
+ # scales[0] is TensorType with sizes = [] in Pytorch == 1.6.0
+ # scales[0] is ListType in Pytorch == 1.7.0
+ # scales[0] is TensorType with sizes = [2] in Pytorch == 1.8.0
+ scale_desc = 'fs' if scales[0].type().kind() == 'ListType' or (
+ scales[0].type().kind() == 'TensorType' and
+ (sum(scales[0].type().sizes()) > 1)) else 'f'
+ available_scales = _maybe_get_const(
+ scales[0], scale_desc) != -1 and not _is_none(scales[0])
+
+ if not available_scales:
+ return None
+
+ offsets = g.op('Constant', value_t=torch.ones(2, dtype=torch.float32))
+ if scale_desc == 'fs':
+ scales_list = g.op(
+ 'Constant',
+ value_t=torch.tensor(_maybe_get_const(scales[0], scale_desc)))
+ # modify to support PyTorch==1.7.0
+ # https://github.com/pytorch/pytorch/blob/75ee5756715e7161314ce037474843b68f69fc04/torch/onnx/symbolic_helper.py#L375 # noqa: E501
+ scales = g.op('Concat', offsets, scales_list, axis_i=0)
+ else:
+ # for PyTorch < 1.7.0
+ scales_list = []
+ for scale in scales:
+ unsqueezed_scale = _unsqueeze_helper(g, scale, 0)
+ # ONNX only supports float for the scales. double -> float.
+ unsqueezed_scale = g.op(
+ 'Cast', unsqueezed_scale, to_i=cast_pytorch_to_onnx['Float'])
+ scales_list.append(unsqueezed_scale)
+ scales = g.op('Concat', offsets, *scales_list, axis_i=0)
+ return scales
+
+
+def _get_interpolate_attributes(g, mode, args):
+ if mode == 'nearest':
+ align_corners = None
+ scales = args[0:]
+ else:
+ align_corners = args[0]
+ scales = args[1:]
+ scales = _interpolate_get_scales_if_available(g, scales)
+ return scales, align_corners
+
+
+def _interpolate_get_scales(g, scale_factor, dim):
+ offsets = g.op('Constant', value_t=torch.ones(2, dtype=torch.float32))
+ if isinstance(scale_factor.type(), torch._C.ListType):
+ return g.op('Concat', offsets, scale_factor, axis_i=0)
+ else:
+ scale_factor = _unsqueeze_helper(g, scale_factor, 0)
+ scale_factor = g.op(
+ 'Cast', scale_factor, to_i=cast_pytorch_to_onnx['Float'])
+ scales = [scale_factor for i in range(dim - 2)]
+ scale_factor = g.op('Concat', offsets, *scales, axis_i=0)
+ return scale_factor
+
+
+def _size_helper(g, self, dim):
+ full_shape = g.op('Shape', self)
+ from torch.onnx.symbolic_opset9 import select
+ return select(g, full_shape, g.op('Constant', value_t=torch.tensor([0])),
+ dim)
+
+
+def _avgpool_helper(tuple_fn, padding, kernel_size, stride, divisor_override,
+ name):
+ if divisor_override and divisor_override.node().kind() != 'prim::Constant':
+ return _unimplemented(name, 'divisor_override')
+ if not stride:
+ stride = kernel_size
+ padding = tuple(tuple_fn(padding))
+ return padding
+
+
+# Metaprogram symbolics for each ATen native specialized cast operator.
+# E.g., we specify a function named `_cast_uint8_t` that instantiates an
+# ONNX cast node with `to` attribute 'UINT8'
+#
+# TODO: remove these once we support Type's in the JIT IR and we can once again
+# use the unified toType operator
+cast_pytorch_to_onnx = {
+ 'Byte': torch.onnx.TensorProtoDataType.UINT8,
+ 'Char': torch.onnx.TensorProtoDataType.INT8,
+ 'Double': torch.onnx.TensorProtoDataType.DOUBLE,
+ 'Float': torch.onnx.TensorProtoDataType.FLOAT,
+ 'Half': torch.onnx.TensorProtoDataType.FLOAT16,
+ 'Int': torch.onnx.TensorProtoDataType.INT32,
+ 'Long': torch.onnx.TensorProtoDataType.INT64,
+ 'Short': torch.onnx.TensorProtoDataType.INT16,
+ 'Bool': torch.onnx.TensorProtoDataType.BOOL,
+ 'ComplexFloat': torch.onnx.TensorProtoDataType.COMPLEX64,
+ 'ComplexDouble': torch.onnx.TensorProtoDataType.COMPLEX128,
+ 'Undefined': torch.onnx.TensorProtoDataType.UNDEFINED,
+}
+
+# Global set to store the list of quantized operators in the network.
+# This is currently only used in the conversion of quantized ops from PT
+# -> C2 via ONNX.
+_quantized_ops = set()
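
A minimal sketch of how the `parse_args` decorator above is consumed when writing a symbolic function; the operator, attribute choices, and import path are purely illustrative:

from mmcv_replace.onnx.onnx_utils.symbolic_helper import parse_args  # path is an assumption

@parse_args('v', 'is', 'i')
def example_max_pool_symbolic(g, input, kernel_size, ceil_mode):
    # 'v' leaves `input` as a raw graph value, 'is' unpacks `kernel_size` into a
    # Python list of ints, and 'i' unpacks `ceil_mode` into an int, so both can
    # be passed directly as ONNX node attributes.
    return g.op(
        'MaxPool',
        input,
        kernel_shape_i=kernel_size,
        ceil_mode_i=ceil_mode)
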
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/onnx/symbolic.py b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/onnx/symbolic.py
new file mode 100644
index 0000000000000000000000000000000000000000..7c597c0846fa492967785ad83e31c7cbfad3d731
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/onnx/symbolic.py
@@ -0,0 +1,509 @@
+# encoding=utf-8
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Modified from https://github.com/pytorch/pytorch."""
+import os
+
+import numpy as np
+import torch
+from torch.nn.modules.utils import _pair, _single, _triple
+from torch.onnx.symbolic_helper import parse_args
+from torch.onnx.symbolic_registry import register_op
+
+from .onnx_utils import symbolic_helper as sym_help
+
+
+def _interpolate(name, dim, interpolate_mode):
+
+ def symbolic_fn(g, input, output_size, *args):
+ scales, align_corners = sym_help._get_interpolate_attributes(
+ g, interpolate_mode, args)
+ align_corners = sym_help._maybe_get_scalar(align_corners)
+ transformation_mode = 'asymmetric' \
+ if interpolate_mode == 'nearest' \
+ else 'align_corners' if align_corners else 'pytorch_half_pixel'
+ empty_tensor = g.op(
+ 'Constant', value_t=torch.tensor([], dtype=torch.float32))
+
+ if scales is None:
+ if 'ONNX_BACKEND' in os.environ and os.environ[
+ 'ONNX_BACKEND'] == 'TensorRT':
+ input_size = input.type().sizes()
+ # slice the first two dim
+ input_size = input_size[:2]
+ # convert output_size to int type
+ output_size = sym_help._maybe_get_const(output_size, 'is')
+ input_size.extend(output_size)
+ output_size = g.op(
+ 'Constant',
+ value_t=torch.tensor(input_size, dtype=torch.int64))
+ else:
+ input_size = g.op('Shape', input)
+ input_size_beg = sym_help._slice_helper(
+ g, input_size, axes=[0], ends=[2], starts=[0])
+ output_size = g.op(
+ 'Cast',
+ output_size,
+ to_i=sym_help.cast_pytorch_to_onnx['Long'])
+ output_size = g.op(
+ 'Concat', input_size_beg, output_size, axis_i=0)
+ scales = g.op(
+ 'Constant', value_t=torch.tensor([], dtype=torch.float32))
+ return g.op(
+ 'Resize',
+ input,
+ empty_tensor,
+ # roi only takes effect with
+ # coordinate_transformation_mode="tf_crop_and_resize"
+ scales, # scales is not needed since we are sending out_size
+ output_size,
+ coordinate_transformation_mode_s=transformation_mode,
+ cubic_coeff_a_f=-0.75, # only valid when mode="cubic"
+ mode_s=interpolate_mode, # nearest, linear, or cubic
+ nearest_mode_s='floor') # only valid when mode="nearest"
+ else:
+ return g.op(
+ 'Resize',
+ input,
+ empty_tensor,
+ # roi only takes effect with
+ # coordinate_transformation_mode="tf_crop_and_resize"
+ scales, # scales is not needed since we are sending out_size
+ coordinate_transformation_mode_s=transformation_mode,
+ cubic_coeff_a_f=-0.75, # only valid when mode="cubic"
+ mode_s=interpolate_mode, # nearest, linear, or cubic
+ nearest_mode_s='floor') # only valid when mode="nearest"
+
+ return symbolic_fn
+
+
+upsample_nearest1d = _interpolate('upsample_nearest1d', 3, 'nearest')
+upsample_nearest2d = _interpolate('upsample_nearest2d', 4, 'nearest')
+upsample_nearest3d = _interpolate('upsample_nearest3d', 5, 'nearest')
+upsample_linear1d = _interpolate('upsample_linear1d', 3, 'linear')
+upsample_bilinear2d = _interpolate('upsample_bilinear2d', 4, 'linear')
+upsample_trilinear3d = _interpolate('upsample_trilinear3d', 5, 'linear')
+upsample_bicubic2d = _interpolate('upsample_bicubic2d', 4, 'cubic')
+
+
+@parse_args('v', 'v', 'i', 'i', 'i', 'none')
+def topk(g, self, k, dim, largest, sorted, out=None):
+ return sym_help._topk_helper(
+ g, self, k, dim, largest=largest, sorted=sorted, out=out)
+
+
+def masked_select(g, self, mask):
+ from torch.onnx.symbolic_opset9 import expand_as, nonzero
+ index = nonzero(g, expand_as(g, mask, self))
+ return g.op('GatherND', self, index)
+
+
+def _prepare_onnx_paddings(g, dim, pad):
+ pad_len = torch.onnx.symbolic_opset9.size(
+ g, pad, g.op('Constant', value_t=torch.tensor([0])))
+ # Set extension = [0] * (dim * 2 - len(pad))
+ extension = g.op(
+ 'Sub',
+ g.op('Mul',
+ g.op('Constant', value_t=torch.tensor(dim, dtype=torch.int64)),
+ g.op('Constant', value_t=torch.tensor(2, dtype=torch.int64))),
+ pad_len)
+ pad = g.op('Cast', pad, to_i=sym_help.cast_pytorch_to_onnx['Long'])
+ paddings = g.op(
+ 'Concat',
+ pad,
+ g.op(
+ 'ConstantOfShape',
+ extension,
+ value_t=torch.tensor([0], dtype=torch.int64)),
+ axis_i=0)
+ paddings = g.op('Reshape', paddings,
+ g.op('Constant', value_t=torch.tensor([-1, 2])))
+ paddings = g.op(
+ 'Transpose',
+ torch.onnx.symbolic_opset10.flip(g, paddings, [0]),
+ perm_i=[1, 0])
+ paddings = g.op('Reshape', paddings,
+ g.op('Constant', value_t=torch.tensor([-1])))
+ padding_c = g.op(
+ 'Cast', paddings, to_i=sym_help.cast_pytorch_to_onnx['Long'])
+ return padding_c
+
+
+def constant_pad_nd(g, input, padding, value=None):
+ mode = 'constant'
+ value = sym_help._maybe_get_scalar(value)
+ value = sym_help._if_scalar_type_as(g, value, input)
+ pad = _prepare_onnx_paddings(g, input.type().dim(), padding)
+ return g.op('Pad', input, pad, value, mode_s=mode)
+
+
+def reflection_pad(g, input, padding):
+ mode = 'reflect'
+ paddings = _prepare_onnx_paddings(g, input.type().dim(), padding)
+ return g.op('Pad', input, paddings, mode_s=mode)
+
+
+reflection_pad1d = reflection_pad
+reflection_pad2d = reflection_pad
+reflection_pad3d = reflection_pad
+
+
+def _avg_pool(name, tuple_fn):
+
+ @parse_args('v', 'is', 'is', 'is', 'i', 'i', 'none')
+ def symbolic_fn(g,
+ input,
+ kernel_size,
+ stride,
+ padding,
+ ceil_mode,
+ count_include_pad,
+ divisor_override=None):
+ padding = sym_help._avgpool_helper(tuple_fn, padding, kernel_size,
+ stride, divisor_override, name)
+ if not stride:
+ stride = kernel_size
+ if count_include_pad:
+ input = g.op(
+ 'Pad',
+ input,
+ g.op(
+ 'Constant',
+ value_t=torch.tensor(((0, ) * 2 + padding) * 2)),
+ mode_s='constant')
+ padding = (0, ) * len(padding)
+ output = g.op(
+ 'AveragePool',
+ input,
+ kernel_shape_i=tuple_fn(kernel_size),
+ strides_i=tuple_fn(stride),
+ pads_i=padding * 2,
+ ceil_mode_i=ceil_mode)
+ return output
+
+ return symbolic_fn
+
+
+avg_pool1d = _avg_pool('avg_pool1d', _single)
+avg_pool2d = _avg_pool('avg_pool2d', _pair)
+avg_pool3d = _avg_pool('avg_pool3d', _triple)
+
+
+def _get_im2col_indices_along_dim(g, input_d, kernel_size_d, dilation_d,
+ padding_d, stride_d):
+ # Input is always 4-D (N, C, H, W)
+ # Calculate indices of sliding blocks along spatial dimension
+ # Slide kernel over input each dim d:
+ # each dimension d ranges from 0 to
+ # input[d]+2xpadding[d]-dilation[d]x(kernel_size[d]-1)
+ # with steps = stride
+
+ blocks_d = g.op('Add', input_d,
+ g.op('Constant', value_t=torch.tensor(padding_d * 2)))
+ blocks_d = g.op(
+ 'Sub', blocks_d,
+ g.op(
+ 'Constant',
+ value_t=torch.tensor(dilation_d * (kernel_size_d - 1))))
+
+ # Stride kernel over input and find starting indices along dim d
+ blocks_d_indices = g.op('Range', g.op('Constant', value_t=torch.tensor(0)),
+ blocks_d,
+ g.op('Constant', value_t=torch.tensor(stride_d)))
+
+ # Apply dilation on kernel and find its indices along dim d
+ kernel_grid = np.arange(0, kernel_size_d * dilation_d, dilation_d)
+ kernel_grid = g.op('Constant', value_t=torch.tensor([kernel_grid]))
+
+ # Broadcast and add kernel starting positions (indices) with
+ # kernel_grid along dim d, to get block indices along dim d
+ blocks_d_indices = g.op(
+ 'Unsqueeze', blocks_d_indices, axes_i=[0]) # Reshape to [1, -1]
+ kernel_mask = g.op('Reshape', kernel_grid,
+ g.op('Constant', value_t=torch.tensor([-1, 1])))
+ block_mask = g.op('Add', blocks_d_indices, kernel_mask)
+
+ return block_mask
+
+
+def _get_im2col_padded_input(g, input, padding_h, padding_w):
+ # Input is always 4-D tensor (N, C, H, W)
+ # Padding tensor has the following format: (padding_h, padding_w)
+ # Reshape the padding to follow ONNX format:
+ # (dim1_begin, dim2_begin,...,dim1_end, dim2_end,...)
+ pad = g.op(
+ 'Constant', value_t=torch.LongTensor([0, 0, padding_h, padding_w] * 2))
+ return g.op('Pad', input, pad)
+
+
+def _get_im2col_output_shape(g, input, kernel_h, kernel_w):
+ batch_dim = size(g, input, g.op('Constant', value_t=torch.tensor(0)))
+ channel_dim = size(g, input, g.op('Constant', value_t=torch.tensor(1)))
+ channel_unfolded = g.op(
+ 'Mul', channel_dim,
+ g.op('Constant', value_t=torch.tensor(kernel_h * kernel_w)))
+
+ return g.op(
+ 'Concat',
+ g.op('Unsqueeze', batch_dim, axes_i=[0]),
+ g.op('Unsqueeze', channel_unfolded, axes_i=[0]),
+ g.op('Constant', value_t=torch.tensor([-1])),
+ axis_i=0)
+
+
+def size(g, self, dim=None):
+ if dim is None:
+ return g.op('Shape', self)
+ return sym_help._size_helper(g, self, dim)
+
+
+@parse_args('v', 'is', 'is', 'is', 'is')
+def im2col(g, input, kernel_size, dilation, padding, stride):
+ # Input is always 4-D tensor (N, C, H, W)
+ # All other args are int[2]
+
+ input_h = size(g, input, g.op('Constant', value_t=torch.tensor(2)))
+ input_w = size(g, input, g.op('Constant', value_t=torch.tensor(3)))
+
+ stride_h, stride_w = stride[0], stride[1]
+ padding_h, padding_w = padding[0], padding[1]
+ dilation_h, dilation_w = dilation[0], dilation[1]
+ kernel_h, kernel_w = kernel_size[0], kernel_size[1]
+
+ blocks_row_indices = _get_im2col_indices_along_dim(g, input_h, kernel_h,
+ dilation_h, padding_h,
+ stride_h)
+ blocks_col_indices = _get_im2col_indices_along_dim(g, input_w, kernel_w,
+ dilation_w, padding_w,
+ stride_w)
+
+ output_shape = _get_im2col_output_shape(g, input, kernel_h, kernel_w)
+ padded_input = _get_im2col_padded_input(g, input, padding_h, padding_w)
+
+ output = g.op('Gather', padded_input, blocks_row_indices, axis_i=2)
+ output = g.op('Gather', output, blocks_col_indices, axis_i=4)
+ output = g.op('Transpose', output, perm_i=[0, 1, 2, 4, 3, 5])
+ return g.op('Reshape', output, output_shape)
+
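+# Illustrative note (added comment, not part of the original file): this
+# symbolic mirrors the eager-mode ``torch.nn.functional.unfold`` (im2col), so
+# for a static 4-D input the exported graph is expected to reproduce, e.g.:
+#   >>> x = torch.randn(1, 3, 8, 8)
+#   >>> cols = torch.nn.functional.unfold(
+#   ...     x, kernel_size=3, dilation=1, padding=1, stride=1)
+#   >>> cols.shape    # (1, 3 * 3 * 3, 64) == (N, C * kH * kW, L)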
+
+@parse_args('v', 'i')
+def one_hot(g, self, num_classes):
+ values = g.op('Constant', value_t=torch.LongTensor([0, 1]))
+ depth = g.op('Constant', value_t=torch.LongTensor([num_classes]))
+ return g.op('OneHot', self, depth, values, axis_i=-1)
+
+
+@parse_args('v', 'i', 'none')
+def softmax(g, input, dim, dtype=None):
+ input_dim = input.type().dim()
+ if input_dim:
+ # TODO: remove this as onnx opset 11 spec allows negative axes
+ if dim < 0:
+ dim = input_dim + dim
+ if input_dim == dim + 1:
+ softmax = g.op('Softmax', input, axis_i=dim)
+ if dtype and dtype.node().kind() != 'prim::Constant':
+ parsed_dtype = sym_help._get_const(dtype, 'i', 'dtype')
+ softmax = g.op(
+ 'Cast',
+ softmax,
+ to_i=sym_help.scalar_type_to_onnx[parsed_dtype])
+ return softmax
+
+ max_value = g.op('ReduceMax', input, axes_i=[dim], keepdims_i=1)
+ input = g.op('Sub', input, max_value)
+ exp = g.op('Exp', input)
+ sum = g.op('ReduceSum', exp, axes_i=[dim])
+ softmax = g.op('Div', exp, sum)
+ if dtype and dtype.node().kind() != 'prim::Constant':
+ parsed_dtype = sym_help._get_const(dtype, 'i', 'dtype')
+ softmax = g.op(
+ 'Cast', softmax, to_i=sym_help.scalar_type_to_onnx[parsed_dtype])
+ return softmax
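+
+# Numerical note (added comment, not part of the original file): the
+# decomposed branch above is the max-shifted softmax,
+#   softmax(x)_i = exp(x_i - max(x)) / sum_j exp(x_j - max(x)),
+# which equals the plain definition but keeps Exp from overflowing, e.g.:
+#   >>> x = torch.tensor([1000., 1001.])
+#   >>> torch.exp(x - x.max()) / torch.exp(x - x.max()).sum()
+#   tensor([0.2689, 0.7311])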
+
+
+def _adaptive_pool(name, type, tuple_fn, fn=None):
+
+ @parse_args('v', 'is')
+ def symbolic_fn(g, input, output_size):
+ if output_size == [1] * len(output_size) and type == 'AveragePool':
+ return g.op('GlobalAveragePool', input)
+ if not input.isCompleteTensor():
+ if output_size == [1] * len(output_size):
+ return g.op('GlobalMaxPool', input), None
+ raise NotImplementedError(
+ '[Adaptive pool]:input size not accessible')
+ dim = input.type().sizes()[2:]
+ if output_size == [1] * len(output_size) and type == 'MaxPool':
+ return g.op('GlobalMaxPool', input), None
+
+ # compute stride = floor(input_size / output_size)
+ s = [int(dim[i] / output_size[i]) for i in range(0, len(dim))]
+
+ # compute kernel_size = input_size - (output_size - 1) * stride
+ k = [dim[i] - (output_size[i] - 1) * s[i] for i in range(0, len(dim))]
+
+ # call max_poolxd_with_indices to get indices in the output
+ if type == 'MaxPool':
+ return fn(g, input, k, k, (0, ) * len(dim), (1, ) * len(dim),
+ False)
+ output = g.op(
+ type,
+ input,
+ kernel_shape_i=tuple_fn(k),
+ strides_i=tuple_fn(s),
+ ceil_mode_i=False)
+ return output
+
+ return symbolic_fn
+
+
+adaptive_avg_pool1d = _adaptive_pool('adaptive_avg_pool1d', 'AveragePool',
+ _single)
+adaptive_avg_pool2d = _adaptive_pool('adaptive_avg_pool2d', 'AveragePool',
+ _pair)
+adaptive_avg_pool3d = _adaptive_pool('adaptive_avg_pool3d', 'AveragePool',
+ _triple)
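+
+# Worked example for the stride/kernel derivation above (added comment, not
+# part of the original file): adaptively average-pooling a static 7x7 map to
+# 3x3 gives stride = floor(7 / 3) = 2 and kernel = 7 - (3 - 1) * 2 = 3, so a
+# plain pool with those settings matches the adaptive result:
+#   >>> x = torch.randn(1, 1, 7, 7)
+#   >>> torch.allclose(torch.nn.functional.adaptive_avg_pool2d(x, 3),
+#   ...                torch.nn.functional.avg_pool2d(x, 3, stride=2))
+#   True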
+
+
+def new_full(g,
+ self,
+ size,
+ fill_value,
+ dtype,
+ layout,
+ device,
+ pin_memory=False):
+ from torch.onnx.symbolic_opset9 import full
+ if dtype is None and self.isCompleteTensor():
+ dtype = self.type().scalarType()
+ dtype = sym_help.scalar_type_to_onnx.index(
+ sym_help.cast_pytorch_to_onnx[dtype])
+ return full(g, size, fill_value, dtype, layout, device, pin_memory)
+
+
+@parse_args('v', 'v', 'i', 'i', 'i')
+def grid_sampler(g,
+ input,
+ grid,
+ interpolation_mode,
+ padding_mode,
+ align_corners=False):
+ return g.op(
+ 'mmcv::grid_sampler',
+ input,
+ grid,
+ interpolation_mode_i=interpolation_mode,
+ padding_mode_i=padding_mode,
+ align_corners_i=align_corners)
+
+
+@parse_args('v', 'i')
+def cummax(g, input, dim):
+ return g.op('mmcv::cummax', input, dim_i=dim, outputs=2)
+
+
+@parse_args('v', 'i')
+def cummin(g, input, dim):
+ return g.op('mmcv::cummin', input, dim_i=dim, outputs=2)
+
+
+@parse_args('v', 'v', 'is')
+def roll(g, input, shifts, dims):
+ from torch.onnx.symbolic_opset9 import squeeze
+ from packaging import version
+ input_shape = g.op('Shape', input)
+
+ need_flatten = len(dims) == 0
+ # If dims is not specified, the tensor will be flattened before
+ # rolling and then restored to the original shape.
+ if need_flatten:
+ resize_shape = input_shape
+ input = g.op('Reshape', input,
+ g.op('Constant', value_t=torch.LongTensor([1, -1])))
+ input_shape = g.op('Shape', input)
+ dims = [1]
+
+ for index, dim in enumerate(dims):
+ end_size = sym_help._slice_helper(
+ g, input_shape, axes=[0], ends=[dim + 1], starts=[dim])
+ shift_size = sym_help._slice_helper(
+ g, shifts, axes=[0], ends=[index + 1], starts=[index])
+ slice_size = g.op('Sub', end_size, shift_size)
+
+ # Cannot use the Mod op here because TensorRT does not support it
+ div_size = g.op('Div', slice_size, end_size)
+ slice_size = g.op('Sub', slice_size, g.op('Mul', end_size, div_size))
+
+ if version.parse(torch.__version__) >= version.parse('1.7.0'):
+ # pytorch >= 1.7.0 needs an explicit dim for the squeeze symbolic (dim=0)
+ end_size = squeeze(g, end_size, 0)
+ slice_size = squeeze(g, slice_size, 0)
+ else:
+ end_size = g.op('Squeeze', end_size)
+ slice_size = g.op('Squeeze', slice_size)
+ dim = torch.LongTensor([dim])
+
+ input_slice0 = sym_help._slice_helper(
+ g,
+ input,
+ axes=dim,
+ starts=torch.LongTensor([0]),
+ ends=slice_size,
+ dynamic_slice=True)
+ input_slice1 = sym_help._slice_helper(
+ g,
+ input,
+ axes=dim,
+ ends=end_size,
+ starts=slice_size,
+ dynamic_slice=True)
+
+ input = g.op('Concat', input_slice1, input_slice0, axis_i=dim)
+
+ if need_flatten:
+ input = g.op('Reshape', input, resize_shape)
+
+ return input
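+
+# Illustrative check of the slice-and-concat scheme above (added comment, not
+# part of the original file): rolling [0, 1, 2, 3, 4] by 2 keeps the first
+# 5 - 2 = 3 elements in one slice and the last 2 in the other, then swaps them:
+#   >>> torch.roll(torch.arange(5), shifts=2, dims=0)
+#   tensor([3, 4, 0, 1, 2])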
+
+
+def register_extra_symbolics(opset=11):
+ register_op('one_hot', one_hot, '', opset)
+ register_op('im2col', im2col, '', opset)
+ register_op('topk', topk, '', opset)
+ register_op('softmax', softmax, '', opset)
+ register_op('constant_pad_nd', constant_pad_nd, '', opset)
+ register_op('reflection_pad1d', reflection_pad1d, '', opset)
+ register_op('reflection_pad2d', reflection_pad2d, '', opset)
+ register_op('reflection_pad3d', reflection_pad3d, '', opset)
+ register_op('avg_pool1d', avg_pool1d, '', opset)
+ register_op('avg_pool2d', avg_pool2d, '', opset)
+ register_op('avg_pool3d', avg_pool3d, '', opset)
+ register_op('adaptive_avg_pool1d', adaptive_avg_pool1d, '', opset)
+ register_op('adaptive_avg_pool2d', adaptive_avg_pool2d, '', opset)
+ register_op('adaptive_avg_pool3d', adaptive_avg_pool3d, '', opset)
+ register_op('masked_select', masked_select, '', opset)
+ register_op('upsample_nearest1d', upsample_nearest1d, '', opset)
+ register_op('upsample_nearest2d', upsample_nearest2d, '', opset)
+ register_op('upsample_nearest3d', upsample_nearest3d, '', opset)
+ register_op('upsample_linear1d', upsample_linear1d, '', opset)
+ register_op('upsample_bilinear2d', upsample_bilinear2d, '', opset)
+ register_op('upsample_trilinear3d', upsample_trilinear3d, '', opset)
+ register_op('upsample_bicubic2d', upsample_bicubic2d, '', opset)
+ register_op('new_full', new_full, '', opset)
+ register_op('grid_sampler', grid_sampler, '', opset)
+ register_op('cummax', cummax, '', opset)
+ register_op('cummin', cummin, '', opset)
+ register_op('roll', roll, '', opset)
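+
+
+# Typical usage sketch (added comment, not part of the original file); the
+# import path below assumes the upstream mmcv layout and may differ in this
+# replacement package, and `model` / `dummy_input` are user-provided:
+#   >>> from mmcv.onnx import register_extra_symbolics
+#   >>> register_extra_symbolics(opset=11)
+#   >>> torch.onnx.export(model, dummy_input, 'tmp.onnx', opset_version=11)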
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/__init__.py b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..4f1f6e5802809af20265bef5e8e6c5429b07f784
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/__init__.py
@@ -0,0 +1,103 @@
+# encoding=utf-8
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .active_rotated_filter import active_rotated_filter
+from .assign_score_withk import assign_score_withk
+from .ball_query import ball_query
+from .bbox import bbox_overlaps
+from .border_align import BorderAlign, border_align
+from .box_iou_rotated import box_iou_rotated
+from .carafe import CARAFE, CARAFENaive, CARAFEPack, carafe, carafe_naive
+from .cc_attention import CrissCrossAttention
+from .contour_expand import contour_expand
+from .convex_iou import convex_giou, convex_iou
+from .corner_pool import CornerPool
+from .correlation import Correlation
+from .deform_conv import DeformConv2d, DeformConv2dPack, deform_conv2d
+from .deform_roi_pool import (DeformRoIPool, DeformRoIPoolPack,
+ ModulatedDeformRoIPoolPack, deform_roi_pool)
+from .deprecated_wrappers import Conv2d_deprecated as Conv2d
+from .deprecated_wrappers import ConvTranspose2d_deprecated as ConvTranspose2d
+from .deprecated_wrappers import Linear_deprecated as Linear
+from .deprecated_wrappers import MaxPool2d_deprecated as MaxPool2d
+from .focal_loss import (SigmoidFocalLoss, SoftmaxFocalLoss,
+ sigmoid_focal_loss, softmax_focal_loss)
+from .furthest_point_sample import (furthest_point_sample,
+ furthest_point_sample_with_dist)
+from .fused_bias_leakyrelu import FusedBiasLeakyReLU, fused_bias_leakyrelu
+from .gather_points import gather_points
+from .group_points import GroupAll, QueryAndGroup, grouping_operation
+from .info import (get_compiler_version, get_compiling_cuda_version,
+ get_onnxruntime_op_path)
+from .iou3d import boxes_iou_bev, nms_bev, nms_normal_bev
+from .knn import knn
+from .masked_conv import MaskedConv2d, masked_conv2d
+from .min_area_polygons import min_area_polygons
+from .modulated_deform_conv import (ModulatedDeformConv2d,
+ ModulatedDeformConv2dPack,
+ modulated_deform_conv2d)
+from .multi_scale_deform_attn import MultiScaleDeformableAttention
+from .nms import batched_nms, nms, nms_match, nms_rotated, soft_nms
+from .pixel_group import pixel_group
+from .point_sample import (SimpleRoIAlign, point_sample,
+ rel_roi_point_to_rel_img_point)
+from .points_in_boxes import (points_in_boxes_all, points_in_boxes_cpu,
+ points_in_boxes_part)
+from .points_in_polygons import points_in_polygons
+from .points_sampler import PointsSampler
+from .psa_mask import PSAMask
+from .riroi_align_rotated import RiRoIAlignRotated, riroi_align_rotated
+from .roi_align import RoIAlign, roi_align
+from .roi_align_rotated import RoIAlignRotated, roi_align_rotated
+from .roi_pool import RoIPool, roi_pool
+from .roiaware_pool3d import RoIAwarePool3d
+from .roipoint_pool3d import RoIPointPool3d
+from .rotated_feature_align import rotated_feature_align
+from .saconv import SAConv2d
+from .scatter_points import DynamicScatter, dynamic_scatter
+from .sync_bn import SyncBatchNorm
+from .three_interpolate import three_interpolate
+from .three_nn import three_nn
+from .tin_shift import TINShift, tin_shift
+from .upfirdn2d import upfirdn2d
+from .voxelize import Voxelization, voxelization
+
+__all__ = [
+ 'bbox_overlaps', 'CARAFE', 'CARAFENaive', 'CARAFEPack', 'carafe',
+ 'carafe_naive', 'CornerPool', 'DeformConv2d', 'DeformConv2dPack',
+ 'deform_conv2d', 'DeformRoIPool', 'DeformRoIPoolPack',
+ 'ModulatedDeformRoIPoolPack', 'deform_roi_pool', 'SigmoidFocalLoss',
+ 'SoftmaxFocalLoss', 'sigmoid_focal_loss', 'softmax_focal_loss',
+ 'get_compiler_version', 'get_compiling_cuda_version',
+ 'get_onnxruntime_op_path', 'MaskedConv2d', 'masked_conv2d',
+ 'ModulatedDeformConv2d', 'ModulatedDeformConv2dPack',
+ 'modulated_deform_conv2d', 'batched_nms', 'nms', 'soft_nms', 'nms_match',
+ 'RoIAlign', 'roi_align', 'RoIPool', 'roi_pool', 'SyncBatchNorm', 'Conv2d',
+ 'ConvTranspose2d', 'Linear', 'MaxPool2d', 'CrissCrossAttention', 'PSAMask',
+ 'point_sample', 'rel_roi_point_to_rel_img_point', 'SimpleRoIAlign',
+ 'SAConv2d', 'TINShift', 'tin_shift', 'assign_score_withk',
+ 'box_iou_rotated', 'RoIPointPool3d', 'nms_rotated', 'knn', 'ball_query',
+ 'upfirdn2d', 'FusedBiasLeakyReLU', 'fused_bias_leakyrelu',
+ 'rotated_feature_align', 'RiRoIAlignRotated', 'riroi_align_rotated',
+ 'RoIAlignRotated', 'roi_align_rotated', 'pixel_group', 'QueryAndGroup',
+ 'GroupAll', 'grouping_operation', 'contour_expand', 'three_nn',
+ 'three_interpolate', 'MultiScaleDeformableAttention', 'BorderAlign',
+ 'border_align', 'gather_points', 'furthest_point_sample',
+ 'furthest_point_sample_with_dist', 'PointsSampler', 'Correlation',
+ 'boxes_iou_bev', 'nms_bev', 'nms_normal_bev', 'Voxelization',
+ 'voxelization', 'dynamic_scatter', 'DynamicScatter', 'RoIAwarePool3d',
+ 'points_in_boxes_part', 'points_in_boxes_cpu', 'points_in_boxes_all',
+ 'points_in_polygons', 'min_area_polygons', 'active_rotated_filter',
+ 'convex_iou', 'convex_giou'
+]
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/active_rotated_filter.py b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/active_rotated_filter.py
new file mode 100644
index 0000000000000000000000000000000000000000..8acda1969bb4566a3464cfff44b2f14da3b0cb17
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/active_rotated_filter.py
@@ -0,0 +1,74 @@
+# encoding=utf-8
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+from torch.autograd import Function
+from torch.autograd.function import once_differentiable
+
+from ..utils import ext_loader
+
+ext_module = ext_loader.load_ext(
+ '_ext',
+ ['active_rotated_filter_forward', 'active_rotated_filter_backward'])
+
+
+class ActiveRotatedFilterFunction(Function):
+ """Encoding the orientation information and generating orientation-
+ sensitive features.
+
+ The details are described in the paper `Align Deep Features for Oriented
+ Object Detection <https://arxiv.org/abs/2008.09397>`_.
+ """
+
+ @staticmethod
+ def forward(ctx, input, indices):
+ """
+ Args:
+ input (torch.Tensor): Input features with shape
+ [num_output_planes, num_input_planes, num_orientations, H, W].
+ indices (torch.Tensor): Indices with shape
+ [num_orientations, H, W, num_rotations].
+
+ Returns:
+ torch.Tensor: Refined features with shape [num_output_planes *
+ num_rotations, num_input_planes * num_orientations, H, W].
+ """
+ ctx.save_for_backward(input, indices)
+ op, ip, o, h, w = input.size()
+ o, h, w, r = indices.size()
+ output = input.new_zeros((op * r, ip * o, h, w))
+ ext_module.active_rotated_filter_forward(input, indices, output)
+
+ return output
+
+ @staticmethod
+ @once_differentiable
+ def backward(ctx, grad_out):
+ """
+ Args:
+ grad_out (torch.Tensor): The gradient of output features
+ with shape [num_output_planes * num_rotations,
+ num_input_planes * num_orientations, H, W].
+
+ Returns:
+ torch.Tensor: The gradient of input features with shape
+ [num_output_planes, num_input_planes, num_orientations, H, W].
+ """
+ input, indices = ctx.saved_tensors
+ grad_in = torch.zeros_like(input)
+ ext_module.active_rotated_filter_backward(grad_out, indices, grad_in)
+ return grad_in, None
+
+
+active_rotated_filter = ActiveRotatedFilterFunction.apply
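+
+
+# Illustrative shape sketch (added comment, not part of the original file);
+# running it needs the compiled `_ext` extension and precomputed rotation
+# index maps (`get_indices` below is a hypothetical helper):
+#   >>> weight = torch.randn(32, 16, 8, 3, 3).cuda()   # (op, ip, o, h, w)
+#   >>> indices = get_indices()                        # (o, h, w, r) = (8, 3, 3, 4)
+#   >>> out = active_rotated_filter(weight, indices)
+#   >>> out.shape                                      # (32 * 4, 16 * 8, 3, 3)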
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/assign_score_withk.py b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/assign_score_withk.py
new file mode 100644
index 0000000000000000000000000000000000000000..a631bbbe4c9693b67f13a87dae5d4fa6d0018180
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/assign_score_withk.py
@@ -0,0 +1,140 @@
+# encoding=utf-8
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from torch.autograd import Function
+
+from ..utils import ext_loader
+
+ext_module = ext_loader.load_ext(
+ '_ext', ['assign_score_withk_forward', 'assign_score_withk_backward'])
+
+
+class AssignScoreWithK(Function):
+ r"""Perform weighted sum to generate output features according to scores.
+ Modified from `PAConv <https://github.com/CVMI-Lab/PAConv>`_.
+
+ This is a memory-efficient CUDA implementation of the assign_scores
+ operation, which first transforms all point features with the weight bank,
+ then assembles neighbor features with ``knn_idx`` and performs a weighted
+ sum with ``scores``.
+
+ See the `paper <https://arxiv.org/abs/2103.14635>`_ appendix Sec. D for
+ more detailed descriptions.
+
+ Note:
+ This implementation assumes using ``neighbor`` kernel input, which is
+ (point_features - center_features, point_features).
+ See https://github.com/CVMI-Lab/PAConv/blob/main/scene_seg/model/
+ pointnet2/paconv.py#L128 for more details.
+ """
+
+ @staticmethod
+ def forward(ctx,
+ scores,
+ point_features,
+ center_features,
+ knn_idx,
+ aggregate='sum'):
+ """
+ Args:
+ scores (torch.Tensor): (B, npoint, K, M), predicted scores to
+ aggregate weight matrices in the weight bank.
+ ``npoint`` is the number of sampled centers.
+ ``K`` is the number of queried neighbors.
+ ``M`` is the number of weight matrices in the weight bank.
+ point_features (torch.Tensor): (B, N, M, out_dim)
+ Pre-computed point features to be aggregated.
+ center_features (torch.Tensor): (B, N, M, out_dim)
+ Pre-computed center features to be aggregated.
+ knn_idx (torch.Tensor): (B, npoint, K), index of sampled kNN.
+ We assume the first idx in each row is the idx of the center.
+ aggregate (str, optional): Aggregation method.
+ Can be 'sum', 'avg' or 'max'. Defaults: 'sum'.
+
+ Returns:
+ torch.Tensor: (B, out_dim, npoint, K), the aggregated features.
+ """
+ agg = {'sum': 0, 'avg': 1, 'max': 2}
+
+ B, N, M, out_dim = point_features.size()
+ _, npoint, K, _ = scores.size()
+
+ output = point_features.new_zeros((B, out_dim, npoint, K))
+ ext_module.assign_score_withk_forward(
+ point_features.contiguous(),
+ center_features.contiguous(),
+ scores.contiguous(),
+ knn_idx.contiguous(),
+ output,
+ B=B,
+ N0=N,
+ N1=npoint,
+ M=M,
+ K=K,
+ O=out_dim,
+ aggregate=agg[aggregate])
+
+ ctx.save_for_backward(output, point_features, center_features, scores,
+ knn_idx)
+ ctx.agg = agg[aggregate]
+
+ return output
+
+ @staticmethod
+ def backward(ctx, grad_out):
+ """
+ Args:
+ grad_out (torch.Tensor): (B, out_dim, npoint, K)
+
+ Returns:
+ tuple[torch.Tensor]: A tuple containing five elements. The first one
+ is the gradient of ``scores`` whose shape is (B, npoint, K, M). The
+ second is the gradient of ``point_features`` whose shape is
+ (B, N, M, out_dim). The third is the gradient of
+ ``center_features`` with the shape of (B, N, M, out_dim). The last
+ two are ``None``.
+ """
+ _, point_features, center_features, scores, knn_idx = ctx.saved_tensors
+
+ agg = ctx.agg
+
+ B, N, M, out_dim = point_features.size()
+ _, npoint, K, _ = scores.size()
+
+ grad_point_features = point_features.new_zeros(point_features.shape)
+ grad_center_features = center_features.new_zeros(center_features.shape)
+ grad_scores = scores.new_zeros(scores.shape)
+
+ ext_module.assign_score_withk_backward(
+ grad_out.contiguous(),
+ point_features.contiguous(),
+ center_features.contiguous(),
+ scores.contiguous(),
+ knn_idx.contiguous(),
+ grad_point_features,
+ grad_center_features,
+ grad_scores,
+ B=B,
+ N0=N,
+ N1=npoint,
+ M=M,
+ K=K,
+ O=out_dim,
+ aggregate=agg)
+
+ return grad_scores, grad_point_features, \
+ grad_center_features, None, None
+
+
+assign_score_withk = AssignScoreWithK.apply
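+
+
+# Illustrative shape sketch (added comment, not part of the original file);
+# requires the compiled `_ext` extension:
+#   >>> B, N, npoint, K, M, out_dim = 2, 1024, 256, 16, 8, 64
+#   >>> scores = torch.rand(B, npoint, K, M).cuda()
+#   >>> point_feats = torch.randn(B, N, M, out_dim).cuda()
+#   >>> center_feats = torch.randn(B, N, M, out_dim).cuda()
+#   >>> knn_idx = torch.randint(0, N, (B, npoint, K)).cuda()
+#   >>> out = assign_score_withk(scores, point_feats, center_feats, knn_idx)
+#   >>> out.shape    # (B, out_dim, npoint, K) == (2, 64, 256, 16)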
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/ball_query.py b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/ball_query.py
new file mode 100644
index 0000000000000000000000000000000000000000..4495f5a6eb4efb8b1c41a2d5eeca4e8bcc91da22
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/ball_query.py
@@ -0,0 +1,69 @@
+# encoding=utf-8
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+from torch.autograd import Function
+
+from ..utils import ext_loader
+
+ext_module = ext_loader.load_ext('_ext', ['ball_query_forward'])
+
+
+class BallQuery(Function):
+ """Find nearby points in spherical space."""
+
+ @staticmethod
+ def forward(ctx, min_radius: float, max_radius: float, sample_num: int,
+ xyz: torch.Tensor, center_xyz: torch.Tensor) -> torch.Tensor:
+ """
+ Args:
+ min_radius (float): minimum radius of the balls.
+ max_radius (float): maximum radius of the balls.
+ sample_num (int): maximum number of features in the balls.
+ xyz (Tensor): (B, N, 3) xyz coordinates of the features.
+ center_xyz (torch.Tensor): (B, npoint, 3) centers of the ball
+ query.
+
+ Returns:
+ torch.Tensor: (B, npoint, nsample) tensor with the indices of the
+ features that form the query balls.
+ """
+ assert center_xyz.is_contiguous()
+ assert xyz.is_contiguous()
+ assert min_radius < max_radius
+
+ B, N, _ = xyz.size()
+ npoint = center_xyz.size(1)
+ idx = xyz.new_zeros(B, npoint, sample_num, dtype=torch.int)
+
+ ext_module.ball_query_forward(
+ center_xyz,
+ xyz,
+ idx,
+ b=B,
+ n=N,
+ m=npoint,
+ min_radius=min_radius,
+ max_radius=max_radius,
+ nsample=sample_num)
+ if torch.__version__ != 'parrots':
+ ctx.mark_non_differentiable(idx)
+ return idx
+
+ @staticmethod
+ def backward(ctx, a=None):
+ return None, None, None, None
+
+
+ball_query = BallQuery.apply
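+
+
+# Illustrative usage sketch (added comment, not part of the original file);
+# requires the compiled `_ext` extension:
+#   >>> xyz = torch.rand(2, 1024, 3).cuda()          # all points
+#   >>> center_xyz = torch.rand(2, 256, 3).cuda()    # query centers
+#   >>> idx = ball_query(0.0, 0.2, 16, xyz, center_xyz)
+#   >>> idx.shape    # (2, 256, 16), integer indices into the 1024 points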
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/bbox.py b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/bbox.py
new file mode 100644
index 0000000000000000000000000000000000000000..e23c228b3eef175a813ed49c54c0bf5ade6a35f6
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/bbox.py
@@ -0,0 +1,87 @@
+# encoding=utf-8
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from ..utils import ext_loader
+
+ext_module = ext_loader.load_ext('_ext', ['bbox_overlaps'])
+
+
+def bbox_overlaps(bboxes1, bboxes2, mode='iou', aligned=False, offset=0):
+ """Calculate overlap between two sets of bboxes.
+
+ If ``aligned`` is ``False``, then calculate the ious between each bbox
+ of bboxes1 and bboxes2, otherwise the ious between each aligned pair of
+ bboxes1 and bboxes2.
+
+ Args:
+ bboxes1 (torch.Tensor): shape (m, 4) in <x1, y1, x2, y2> format or
+ empty.
+ bboxes2 (torch.Tensor): shape (n, 4) in <x1, y1, x2, y2> format or
+ empty. If aligned is ``True``, then m and n must be equal.
+ mode (str): "iou" (intersection over union) or "iof" (intersection
+ over foreground).
+
+ Returns:
+ torch.Tensor: Return the ious between boxes. If ``aligned`` is
+ ``False``, the shape of ious is (m, n) else (m, 1).
+
+ Example:
+ >>> bboxes1 = torch.FloatTensor([
+ >>> [0, 0, 10, 10],
+ >>> [10, 10, 20, 20],
+ >>> [32, 32, 38, 42],
+ >>> ])
+ >>> bboxes2 = torch.FloatTensor([
+ >>> [0, 0, 10, 20],
+ >>> [0, 10, 10, 19],
+ >>> [10, 10, 20, 20],
+ >>> ])
+ >>> bbox_overlaps(bboxes1, bboxes2)
+ tensor([[0.5000, 0.0000, 0.0000],
+ [0.0000, 0.0000, 1.0000],
+ [0.0000, 0.0000, 0.0000]])
+
+ Example:
+ >>> empty = torch.FloatTensor([])
+ >>> nonempty = torch.FloatTensor([
+ >>> [0, 0, 10, 9],
+ >>> ])
+ >>> assert tuple(bbox_overlaps(empty, nonempty).shape) == (0, 1)
+ >>> assert tuple(bbox_overlaps(nonempty, empty).shape) == (1, 0)
+ >>> assert tuple(bbox_overlaps(empty, empty).shape) == (0, 0)
+ """
+
+ mode_dict = {'iou': 0, 'iof': 1}
+ assert mode in mode_dict.keys()
+ mode_flag = mode_dict[mode]
+ # Either the boxes are empty or the length of boxes' last dimension is 4
+ assert (bboxes1.size(-1) == 4 or bboxes1.size(0) == 0)
+ assert (bboxes2.size(-1) == 4 or bboxes2.size(0) == 0)
+ assert offset == 1 or offset == 0
+
+ rows = bboxes1.size(0)
+ cols = bboxes2.size(0)
+ if aligned:
+ assert rows == cols
+
+ if rows * cols == 0:
+ return bboxes1.new(rows, 1) if aligned else bboxes1.new(rows, cols)
+
+ if aligned:
+ ious = bboxes1.new_zeros(rows)
+ else:
+ ious = bboxes1.new_zeros((rows, cols))
+ ext_module.bbox_overlaps(
+ bboxes1, bboxes2, ious, mode=mode_flag, aligned=aligned, offset=offset)
+ return ious
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/border_align.py b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/border_align.py
new file mode 100644
index 0000000000000000000000000000000000000000..03a35d93c6a9947ed2292abd6cdc180144f8041e
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/border_align.py
@@ -0,0 +1,122 @@
+# encoding=utf-8
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# modified from
+# https://github.com/Megvii-BaseDetection/cvpods/blob/master/cvpods/layers/border_align.py
+
+import torch
+import torch.nn as nn
+from torch.autograd import Function
+from torch.autograd.function import once_differentiable
+
+from ..utils import ext_loader
+
+ext_module = ext_loader.load_ext(
+ '_ext', ['border_align_forward', 'border_align_backward'])
+
+
+class BorderAlignFunction(Function):
+
+ @staticmethod
+ def symbolic(g, input, boxes, pool_size):
+ return g.op(
+ 'mmcv::MMCVBorderAlign', input, boxes, pool_size_i=pool_size)
+
+ @staticmethod
+ def forward(ctx, input, boxes, pool_size):
+ ctx.pool_size = pool_size
+ ctx.input_shape = input.size()
+
+ assert boxes.ndim == 3, 'boxes must have shape [B, H*W, 4]'
+ assert boxes.size(2) == 4, \
+ 'the last dimension of boxes must be (x1, y1, x2, y2)'
+ assert input.size(1) % 4 == 0, \
+ 'the channel for input feature must be divisible by factor 4'
+
+ # [B, C//4, H*W, 4]
+ output_shape = (input.size(0), input.size(1) // 4, boxes.size(1), 4)
+ output = input.new_zeros(output_shape)
+ # `argmax_idx` only used for backward
+ argmax_idx = input.new_zeros(output_shape).to(torch.int)
+
+ ext_module.border_align_forward(
+ input, boxes, output, argmax_idx, pool_size=ctx.pool_size)
+
+ ctx.save_for_backward(boxes, argmax_idx)
+ return output
+
+ @staticmethod
+ @once_differentiable
+ def backward(ctx, grad_output):
+ boxes, argmax_idx = ctx.saved_tensors
+ grad_input = grad_output.new_zeros(ctx.input_shape)
+ # complex head architecture may cause grad_output uncontiguous
+ grad_output = grad_output.contiguous()
+ ext_module.border_align_backward(
+ grad_output,
+ boxes,
+ argmax_idx,
+ grad_input,
+ pool_size=ctx.pool_size)
+ return grad_input, None, None
+
+
+border_align = BorderAlignFunction.apply
+
+
+class BorderAlign(nn.Module):
+ r"""Border align pooling layer.
+
+ Applies border_align over the input feature based on predicted bboxes.
+ The details are described in the paper
+ `BorderDet: Border Feature for Dense Object Detection
+ <https://arxiv.org/abs/2007.11056>`_.
+
+ For each border line (e.g. top, left, bottom or right) of each box,
+ border_align does the following:
+
+ 1. uniformly samples ``pool_size`` + 1 positions on this line, including
+ the start and end points;
+ 2. the features at these positions are computed by bilinear
+ interpolation;
+ 3. max pooling over all ``pool_size`` + 1 positions is used to compute
+ the pooled feature.
+
+ Args:
+ pool_size (int): number of positions sampled over the boxes' borders
+ (e.g. top, bottom, left, right).
+ """
+
+ def __init__(self, pool_size):
+ super(BorderAlign, self).__init__()
+ self.pool_size = pool_size
+
+ def forward(self, input, boxes):
+ """
+ Args:
+ input: Features with shape [N,4C,H,W]. Channels in the ranges [0,C),
+ [C,2C), [2C,3C) and [3C,4C) represent the top, left, bottom and
+ right features respectively.
+ boxes: Boxes with shape [N,H*W,4]. Coordinate format (x1,y1,x2,y2).
+
+ Returns:
+ torch.Tensor: Pooled features with shape [N,C,H*W,4]. The order is
+ (top,left,bottom,right) for the last dimension.
+ """
+ return border_align(input, boxes, self.pool_size)
+
+ def __repr__(self):
+ s = self.__class__.__name__
+ s += f'(pool_size={self.pool_size})'
+ return s
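+
+
+# Illustrative shape sketch (added comment, not part of the original file);
+# requires the compiled `_ext` extension:
+#   >>> feat = torch.rand(1, 4 * 16, 10, 10).cuda()   # 4C channels, C = 16
+#   >>> boxes = torch.rand(1, 10 * 10, 4).cuda()      # one box per location
+#   >>> pooled = BorderAlign(pool_size=10)(feat, boxes)
+#   >>> pooled.shape    # (1, 16, 100, 4)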
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/box_iou_rotated.py b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/box_iou_rotated.py
new file mode 100644
index 0000000000000000000000000000000000000000..29b6631c24893b99b2b7d4beabfec734c080d6d4
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/box_iou_rotated.py
@@ -0,0 +1,159 @@
+# encoding=utf-8
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from ..utils import ext_loader
+
+ext_module = ext_loader.load_ext('_ext', ['box_iou_rotated'])
+
+
+def box_iou_rotated(bboxes1,
+ bboxes2,
+ mode='iou',
+ aligned=False,
+ clockwise=True):
+ """Return intersection-over-union (Jaccard index) of boxes.
+
+ Both sets of boxes are expected to be in
+ (x_center, y_center, width, height, angle) format.
+
+ If ``aligned`` is ``False``, then calculate the ious between each bbox
+ of bboxes1 and bboxes2, otherwise the ious between each aligned pair of
+ bboxes1 and bboxes2.
+
+ .. note::
+ The operator assumes:
+
+ 1) The positive direction along x axis is left -> right.
+
+ 2) The positive direction along y axis is top -> down.
+
+ 3) The w border is in parallel with x axis when angle = 0.
+
+ However, there are 2 opposite definitions of the positive angular
+ direction, clockwise (CW) and counter-clockwise (CCW). MMCV supports
+ both definitions and uses CW by default.
+
+ Please set ``clockwise=False`` if you are using the CCW definition.
+
+ The coordinate system when ``clockwise`` is ``True`` (default)
+
+ .. code-block:: none
+
+ 0-------------------> x (0 rad)
+ | A-------------B
+ | | |
+ | | box h
+ | | angle=0 |
+ | D------w------C
+ v
+ y (pi/2 rad)
+
+ In such a coordinate system the rotation matrix is
+
+ .. math::
+ \\begin{pmatrix}
+ \\cos\\alpha & -\\sin\\alpha \\\\
+ \\sin\\alpha & \\cos\\alpha
+ \\end{pmatrix}
+
+ The coordinates of the corner point A can be calculated as:
+
+ .. math::
+ P_A=
+ \\begin{pmatrix} x_A \\\\ y_A\\end{pmatrix}
+ =
+ \\begin{pmatrix} x_{center} \\\\ y_{center}\\end{pmatrix} +
+ \\begin{pmatrix}\\cos\\alpha & -\\sin\\alpha \\\\
+ \\sin\\alpha & \\cos\\alpha\\end{pmatrix}
+ \\begin{pmatrix} -0.5w \\\\ -0.5h\\end{pmatrix} \\\\
+ =
+ \\begin{pmatrix} x_{center}-0.5w\\cos\\alpha+0.5h\\sin\\alpha
+ \\\\
+ y_{center}-0.5w\\sin\\alpha-0.5h\\cos\\alpha\\end{pmatrix}
+
+
+ The coordinate system when ``clockwise`` is ``False``
+
+ .. code-block:: none
+
+ 0-------------------> x (0 rad)
+ | A-------------B
+ | | |
+ | | box h
+ | | angle=0 |
+ | D------w------C
+ v
+ y (-pi/2 rad)
+
+ In such a coordinate system the rotation matrix is
+
+ .. math::
+ \\begin{pmatrix}
+ \\cos\\alpha & \\sin\\alpha \\\\
+ -\\sin\\alpha & \\cos\\alpha
+ \\end{pmatrix}
+
+ The coordinates of the corner point A can be calculated as:
+
+ .. math::
+ P_A=
+ \\begin{pmatrix} x_A \\\\ y_A\\end{pmatrix}
+ =
+ \\begin{pmatrix} x_{center} \\\\ y_{center}\\end{pmatrix} +
+ \\begin{pmatrix}\\cos\\alpha & \\sin\\alpha \\\\
+ -\\sin\\alpha & \\cos\\alpha\\end{pmatrix}
+ \\begin{pmatrix} -0.5w \\\\ -0.5h\\end{pmatrix} \\\\
+ =
+ \\begin{pmatrix} x_{center}-0.5w\\cos\\alpha-0.5h\\sin\\alpha
+ \\\\
+ y_{center}+0.5w\\sin\\alpha-0.5h\\cos\\alpha\\end{pmatrix}
+
+ Args:
+ bboxes1 (torch.Tensor): rotated bboxes 1. It has shape (N, 5),
+ indicating (x, y, w, h, theta) for each row. Note that theta is in
+ radian.
+ bboxes2 (torch.Tensor): rotated bboxes 2. It has shape (M, 5),
+ indicating (x, y, w, h, theta) for each row. Note that theta is in
+ radian.
+ mode (str): "iou" (intersection over union) or "iof" (intersection
+ over foreground).
+ clockwise (bool): flag indicating whether the positive angular
+ orientation is clockwise. default True.
+ `New in version 1.4.3.`
+
+ Returns:
+ torch.Tensor: Return the ious between boxes. If ``aligned`` is
+ ``False``, the shape of ious is (N, M) else (N,).
+ """
+ assert mode in ['iou', 'iof']
+ mode_dict = {'iou': 0, 'iof': 1}
+ mode_flag = mode_dict[mode]
+ rows = bboxes1.size(0)
+ cols = bboxes2.size(0)
+ if aligned:
+ ious = bboxes1.new_zeros(rows)
+ else:
+ ious = bboxes1.new_zeros((rows * cols))
+ if not clockwise:
+ flip_mat = bboxes1.new_ones(bboxes1.shape[-1])
+ flip_mat[-1] = -1
+ bboxes1 = bboxes1 * flip_mat
+ bboxes2 = bboxes2 * flip_mat
+ bboxes1 = bboxes1.contiguous()
+ bboxes2 = bboxes2.contiguous()
+ ext_module.box_iou_rotated(
+ bboxes1, bboxes2, ious, mode_flag=mode_flag, aligned=aligned)
+ if not aligned:
+ ious = ious.view(rows, cols)
+ return ious
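+
+
+# Illustrative usage sketch (added comment, not part of the original file);
+# requires the compiled `_ext` extension:
+#   >>> boxes1 = torch.tensor([[10., 10., 4., 2., 0.]]).cuda()  # (x, y, w, h, theta)
+#   >>> boxes2 = torch.tensor([[10., 10., 4., 2., 0.],
+#   ...                        [15., 15., 4., 2., 0.5]]).cuda()
+#   >>> box_iou_rotated(boxes1, boxes2).shape   # (1, 2); first entry is 1.0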
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/carafe.py b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/carafe.py
new file mode 100644
index 0000000000000000000000000000000000000000..8d185f40778d064cc57474fc23fb71163017e5b6
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/carafe.py
@@ -0,0 +1,301 @@
+# encoding=utf-8
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.autograd import Function
+from torch.nn.modules.module import Module
+
+from ..cnn import UPSAMPLE_LAYERS, normal_init, xavier_init
+from ..utils import ext_loader
+
+ext_module = ext_loader.load_ext('_ext', [
+ 'carafe_naive_forward', 'carafe_naive_backward', 'carafe_forward',
+ 'carafe_backward'
+])
+
+
+class CARAFENaiveFunction(Function):
+
+ @staticmethod
+ def symbolic(g, features, masks, kernel_size, group_size, scale_factor):
+ return g.op(
+ 'mmcv::MMCVCARAFENaive',
+ features,
+ masks,
+ kernel_size_i=kernel_size,
+ group_size_i=group_size,
+ scale_factor_f=scale_factor)
+
+ @staticmethod
+ def forward(ctx, features, masks, kernel_size, group_size, scale_factor):
+ assert scale_factor >= 1
+ assert masks.size(1) == kernel_size * kernel_size * group_size
+ assert masks.size(-1) == features.size(-1) * scale_factor
+ assert masks.size(-2) == features.size(-2) * scale_factor
+ assert features.size(1) % group_size == 0
+ assert (kernel_size - 1) % 2 == 0 and kernel_size >= 1
+ ctx.kernel_size = kernel_size
+ ctx.group_size = group_size
+ ctx.scale_factor = scale_factor
+ ctx.feature_size = features.size()
+ ctx.mask_size = masks.size()
+
+ n, c, h, w = features.size()
+ output = features.new_zeros((n, c, h * scale_factor, w * scale_factor))
+ ext_module.carafe_naive_forward(
+ features,
+ masks,
+ output,
+ kernel_size=kernel_size,
+ group_size=group_size,
+ scale_factor=scale_factor)
+
+ if features.requires_grad or masks.requires_grad:
+ ctx.save_for_backward(features, masks)
+ return output
+
+ @staticmethod
+ def backward(ctx, grad_output):
+ assert grad_output.is_cuda
+
+ features, masks = ctx.saved_tensors
+ kernel_size = ctx.kernel_size
+ group_size = ctx.group_size
+ scale_factor = ctx.scale_factor
+
+ grad_input = torch.zeros_like(features)
+ grad_masks = torch.zeros_like(masks)
+ ext_module.carafe_naive_backward(
+ grad_output.contiguous(),
+ features,
+ masks,
+ grad_input,
+ grad_masks,
+ kernel_size=kernel_size,
+ group_size=group_size,
+ scale_factor=scale_factor)
+
+ return grad_input, grad_masks, None, None, None
+
+
+carafe_naive = CARAFENaiveFunction.apply
+
+
+class CARAFENaive(Module):
+
+ def __init__(self, kernel_size, group_size, scale_factor):
+ super(CARAFENaive, self).__init__()
+
+ assert isinstance(kernel_size, int) and isinstance(
+ group_size, int) and isinstance(scale_factor, int)
+ self.kernel_size = kernel_size
+ self.group_size = group_size
+ self.scale_factor = scale_factor
+
+ def forward(self, features, masks):
+ return carafe_naive(features, masks, self.kernel_size, self.group_size,
+ self.scale_factor)
+
+
+class CARAFEFunction(Function):
+
+ @staticmethod
+ def symbolic(g, features, masks, kernel_size, group_size, scale_factor):
+ return g.op(
+ 'mmcv::MMCVCARAFE',
+ features,
+ masks,
+ kernel_size_i=kernel_size,
+ group_size_i=group_size,
+ scale_factor_f=scale_factor)
+
+ @staticmethod
+ def forward(ctx, features, masks, kernel_size, group_size, scale_factor):
+ assert scale_factor >= 1
+ assert masks.size(1) == kernel_size * kernel_size * group_size
+ assert masks.size(-1) == features.size(-1) * scale_factor
+ assert masks.size(-2) == features.size(-2) * scale_factor
+ assert features.size(1) % group_size == 0
+ assert (kernel_size - 1) % 2 == 0 and kernel_size >= 1
+ ctx.kernel_size = kernel_size
+ ctx.group_size = group_size
+ ctx.scale_factor = scale_factor
+ ctx.feature_size = features.size()
+ ctx.mask_size = masks.size()
+
+ n, c, h, w = features.size()
+ output = features.new_zeros((n, c, h * scale_factor, w * scale_factor))
+ routput = features.new_zeros(output.size(), requires_grad=False)
+ rfeatures = features.new_zeros(features.size(), requires_grad=False)
+ rmasks = masks.new_zeros(masks.size(), requires_grad=False)
+ ext_module.carafe_forward(
+ features,
+ masks,
+ rfeatures,
+ routput,
+ rmasks,
+ output,
+ kernel_size=kernel_size,
+ group_size=group_size,
+ scale_factor=scale_factor)
+
+ if features.requires_grad or masks.requires_grad:
+ ctx.save_for_backward(features, masks, rfeatures)
+ return output
+
+ @staticmethod
+ def backward(ctx, grad_output):
+ assert grad_output.is_cuda
+
+ features, masks, rfeatures = ctx.saved_tensors
+ kernel_size = ctx.kernel_size
+ group_size = ctx.group_size
+ scale_factor = ctx.scale_factor
+
+ rgrad_output = torch.zeros_like(grad_output, requires_grad=False)
+ rgrad_input_hs = torch.zeros_like(grad_output, requires_grad=False)
+ rgrad_input = torch.zeros_like(features, requires_grad=False)
+ rgrad_masks = torch.zeros_like(masks, requires_grad=False)
+ grad_input = torch.zeros_like(features, requires_grad=False)
+ grad_masks = torch.zeros_like(masks, requires_grad=False)
+ ext_module.carafe_backward(
+ grad_output.contiguous(),
+ rfeatures,
+ masks,
+ rgrad_output,
+ rgrad_input_hs,
+ rgrad_input,
+ rgrad_masks,
+ grad_input,
+ grad_masks,
+ kernel_size=kernel_size,
+ group_size=group_size,
+ scale_factor=scale_factor)
+ return grad_input, grad_masks, None, None, None
+
+
+carafe = CARAFEFunction.apply
+
+
+class CARAFE(Module):
+ """ CARAFE: Content-Aware ReAssembly of FEatures
+
+ Please refer to `CARAFE: Content-Aware ReAssembly of FEatures
+ <https://arxiv.org/abs/1905.02188>`_ for more details.
+
+ Args:
+ kernel_size (int): reassemble kernel size
+ group_size (int): reassemble group size
+ scale_factor (int): upsample ratio
+
+ Returns:
+ upsampled feature map
+ """
+
+ def __init__(self, kernel_size, group_size, scale_factor):
+ super(CARAFE, self).__init__()
+
+ assert isinstance(kernel_size, int) and isinstance(
+ group_size, int) and isinstance(scale_factor, int)
+ self.kernel_size = kernel_size
+ self.group_size = group_size
+ self.scale_factor = scale_factor
+
+ def forward(self, features, masks):
+ return carafe(features, masks, self.kernel_size, self.group_size,
+ self.scale_factor)
+
+
+@UPSAMPLE_LAYERS.register_module(name='carafe')
+class CARAFEPack(nn.Module):
+ """A unified package of CARAFE upsampler that contains: 1) channel
+ compressor 2) content encoder 3) CARAFE op.
+
+ Official implementation of ICCV 2019 paper
+ `CARAFE: Content-Aware ReAssembly of FEatures
+ <https://arxiv.org/abs/1905.02188>`_.
+
+ Args:
+ channels (int): input feature channels
+ scale_factor (int): upsample ratio
+ up_kernel (int): kernel size of CARAFE op
+ up_group (int): group size of CARAFE op
+ encoder_kernel (int): kernel size of content encoder
+ encoder_dilation (int): dilation of content encoder
+ compressed_channels (int): output channels of channels compressor
+
+ Returns:
+ upsampled feature map
+ """
+
+ def __init__(self,
+ channels,
+ scale_factor,
+ up_kernel=5,
+ up_group=1,
+ encoder_kernel=3,
+ encoder_dilation=1,
+ compressed_channels=64):
+ super(CARAFEPack, self).__init__()
+ self.channels = channels
+ self.scale_factor = scale_factor
+ self.up_kernel = up_kernel
+ self.up_group = up_group
+ self.encoder_kernel = encoder_kernel
+ self.encoder_dilation = encoder_dilation
+ self.compressed_channels = compressed_channels
+ self.channel_compressor = nn.Conv2d(channels, self.compressed_channels,
+ 1)
+ self.content_encoder = nn.Conv2d(
+ self.compressed_channels,
+ self.up_kernel * self.up_kernel * self.up_group *
+ self.scale_factor * self.scale_factor,
+ self.encoder_kernel,
+ padding=int((self.encoder_kernel - 1) * self.encoder_dilation / 2),
+ dilation=self.encoder_dilation,
+ groups=1)
+ self.init_weights()
+
+ def init_weights(self):
+ for m in self.modules():
+ if isinstance(m, nn.Conv2d):
+ xavier_init(m, distribution='uniform')
+ normal_init(self.content_encoder, std=0.001)
+
+ def kernel_normalizer(self, mask):
+ mask = F.pixel_shuffle(mask, self.scale_factor)
+ n, mask_c, h, w = mask.size()
+ # use float division explicitly,
+ # to avoid inconsistency while exporting to onnx
+ mask_channel = int(mask_c / float(self.up_kernel**2))
+ mask = mask.view(n, mask_channel, -1, h, w)
+
+ mask = F.softmax(mask, dim=2, dtype=mask.dtype)
+ mask = mask.view(n, mask_c, h, w).contiguous()
+
+ return mask
+
+ def feature_reassemble(self, x, mask):
+ x = carafe(x, mask, self.up_kernel, self.up_group, self.scale_factor)
+ return x
+
+ def forward(self, x):
+ compressed_x = self.channel_compressor(x)
+ mask = self.content_encoder(compressed_x)
+ mask = self.kernel_normalizer(mask)
+
+ x = self.feature_reassemble(x, mask)
+ return x
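+
+
+# Illustrative usage sketch (added comment, not part of the original file);
+# the CARAFE op itself needs the compiled `_ext` extension:
+#   >>> upsampler = CARAFEPack(channels=256, scale_factor=2).cuda()
+#   >>> x = torch.randn(2, 256, 24, 24).cuda()
+#   >>> upsampler(x).shape    # (2, 256, 48, 48)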
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/cc_attention.py b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/cc_attention.py
new file mode 100644
index 0000000000000000000000000000000000000000..a42401eae5f73c5219b7db3bc941372be98292b9
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/cc_attention.py
@@ -0,0 +1,97 @@
+# encoding=utf-8
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from mmcv.cnn import PLUGIN_LAYERS, Scale
+
+
+def NEG_INF_DIAG(n, device):
+ """Returns a diagonal matrix of size [n, n].
+
+ The diagonal elements are all "-inf". This avoids counting the
+ overlapped elements in the Criss-Cross attention twice.
+ """
+ return torch.diag(torch.tensor(float('-inf')).to(device).repeat(n), 0)
+
+
+@PLUGIN_LAYERS.register_module()
+class CrissCrossAttention(nn.Module):
+ """Criss-Cross Attention Module.
+
+ .. note::
+ Before v1.3.13, we use a CUDA op. Since v1.3.13, we switch
+ to a pure PyTorch and equivalent implementation. For more
+ details, please refer to https://github.com/open-mmlab/mmcv/pull/1201.
+
+ Speed comparison for one forward pass
+
+ - Input size: [2,512,97,97]
+ - Device: 1 NVIDIA GeForce RTX 2080 Ti
+
+ +-----------------------+---------------+------------+---------------+
+ | |PyTorch version|CUDA version|Relative speed |
+ +=======================+===============+============+===============+
+ |with torch.no_grad() |0.00554402 s |0.0299619 s |5.4x |
+ +-----------------------+---------------+------------+---------------+
+ |no with torch.no_grad()|0.00562803 s |0.0301349 s |5.4x |
+ +-----------------------+---------------+------------+---------------+
+
+ Args:
+ in_channels (int): Channels of the input feature map.
+ """
+
+ def __init__(self, in_channels):
+ super().__init__()
+ self.query_conv = nn.Conv2d(in_channels, in_channels // 8, 1)
+ self.key_conv = nn.Conv2d(in_channels, in_channels // 8, 1)
+ self.value_conv = nn.Conv2d(in_channels, in_channels, 1)
+ self.gamma = Scale(0.)
+ self.in_channels = in_channels
+
+ def forward(self, x):
+ """forward function of Criss-Cross Attention.
+
+ Args:
+ x (torch.Tensor): Input feature with the shape of
+ (batch_size, in_channels, height, width).
+
+ Returns:
+ torch.Tensor: Output of the layer, with the shape of
+ (batch_size, in_channels, height, width)
+ """
+ B, C, H, W = x.size()
+ query = self.query_conv(x)
+ key = self.key_conv(x)
+ value = self.value_conv(x)
+ energy_H = torch.einsum('bchw,bciw->bwhi', query, key) + NEG_INF_DIAG(
+ H, query.device)
+ energy_H = energy_H.transpose(1, 2)
+ energy_W = torch.einsum('bchw,bchj->bhwj', query, key)
+ attn = F.softmax(
+ torch.cat([energy_H, energy_W], dim=-1), dim=-1) # [B,H,W,(H+W)]
+ out = torch.einsum('bciw,bhwi->bchw', value, attn[..., :H])
+ out += torch.einsum('bchj,bhwj->bchw', value, attn[..., H:])
+
+ out = self.gamma(out) + x
+ out = out.contiguous()
+
+ return out
+
+ def __repr__(self):
+ s = self.__class__.__name__
+ s += f'(in_channels={self.in_channels})'
+ return s
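+
+
+# Illustrative usage sketch (added comment, not part of the original file);
+# this module is pure PyTorch, so the check also runs on CPU:
+#   >>> cca = CrissCrossAttention(in_channels=64)
+#   >>> x = torch.randn(2, 64, 32, 32)
+#   >>> cca(x).shape    # (2, 64, 32, 32)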
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/contour_expand.py b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/contour_expand.py
new file mode 100644
index 0000000000000000000000000000000000000000..af975e32cc073004673bfdb24e55817ce1dd0edb
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/contour_expand.py
@@ -0,0 +1,62 @@
+# encoding=utf-8
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import numpy as np
+import torch
+
+from ..utils import ext_loader
+
+ext_module = ext_loader.load_ext('_ext', ['contour_expand'])
+
+
+def contour_expand(kernel_mask, internal_kernel_label, min_kernel_area,
+ kernel_num):
+ """Expand kernel contours so that foreground pixels are assigned into
+ instances.
+
+ Args:
+ kernel_mask (np.array or torch.Tensor): The instance kernel mask with
+ size hxw.
+ internal_kernel_label (np.array or torch.Tensor): The instance internal
+ kernel label with size hxw.
+ min_kernel_area (int): The minimum kernel area.
+ kernel_num (int): The instance kernel number.
+
+ Returns:
+ list: The instance index map with size hxw.
+ """
+ assert isinstance(kernel_mask, (torch.Tensor, np.ndarray))
+ assert isinstance(internal_kernel_label, (torch.Tensor, np.ndarray))
+ assert isinstance(min_kernel_area, int)
+ assert isinstance(kernel_num, int)
+
+ if isinstance(kernel_mask, np.ndarray):
+ kernel_mask = torch.from_numpy(kernel_mask)
+ if isinstance(internal_kernel_label, np.ndarray):
+ internal_kernel_label = torch.from_numpy(internal_kernel_label)
+
+ if torch.__version__ == 'parrots':
+ if kernel_mask.shape[0] == 0 or internal_kernel_label.shape[0] == 0:
+ label = []
+ else:
+ label = ext_module.contour_expand(
+ kernel_mask,
+ internal_kernel_label,
+ min_kernel_area=min_kernel_area,
+ kernel_num=kernel_num)
+ label = label.tolist()
+ else:
+ label = ext_module.contour_expand(kernel_mask, internal_kernel_label,
+ min_kernel_area, kernel_num)
+ return label
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/convex_iou.py b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/convex_iou.py
new file mode 100644
index 0000000000000000000000000000000000000000..6b30c0e8e0c85cacea9ad7501096bc044e025a6e
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/convex_iou.py
@@ -0,0 +1,59 @@
+# encoding=utf-8
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from ..utils import ext_loader
+
+ext_module = ext_loader.load_ext('_ext', ['convex_iou', 'convex_giou'])
+
+
+def convex_giou(pointsets, polygons):
+ """Return generalized intersection-over-union (Jaccard index) between point
+ sets and polygons.
+
+ Args:
+ pointsets (torch.Tensor): It has shape (N, 18),
+ indicating (x1, y1, x2, y2, ..., x9, y9) for each row.
+ polygons (torch.Tensor): It has shape (N, 8),
+ indicating (x1, y1, x2, y2, x3, y3, x4, y4) for each row.
+
+ Returns:
+ tuple[torch.Tensor, torch.Tensor]: The first element is the gious
+ between point sets and polygons with the shape (N,). The second
+ element is the gradient of point sets with the shape (N, 18).
+ """
+ output = pointsets.new_zeros((pointsets.size(0), 19))
+ ext_module.convex_giou(pointsets, polygons, output)
+ convex_giou = output[:, -1]
+ points_grad = output[:, 0:-1]
+ return convex_giou, points_grad
+
+
+def convex_iou(pointsets, polygons):
+ """Return intersection-over-union (Jaccard index) between point sets and
+ polygons.
+
+ Args:
+ pointsets (torch.Tensor): It has shape (N, 18),
+ indicating (x1, y1, x2, y2, ..., x9, y9) for each row.
+ polygons (torch.Tensor): It has shape (K, 8),
+ indicating (x1, y1, x2, y2, x3, y3, x4, y4) for each row.
+
+ Returns:
+ torch.Tensor: Return the ious between point sets and polygons with the
+ shape (N, K).
+ """
+ N, K = pointsets.size(0), polygons.size(0)
+ ious = pointsets.new_zeros((N, K))
+ ext_module.convex_iou(pointsets, polygons, ious)
+ return ious
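+
+
+# Illustrative shape sketch (added comment, not part of the original file);
+# requires the compiled `_ext` extension:
+#   >>> pointsets = torch.rand(6, 18).cuda()    # 9 (x, y) points per set
+#   >>> polygons = torch.rand(4, 8).cuda()      # 4 quadrilaterals
+#   >>> convex_iou(pointsets, polygons).shape   # (6, 4)
+#   >>> giou, grad = convex_giou(pointsets, torch.rand(6, 8).cuda())
+#   >>> giou.shape, grad.shape                  # (6,), (6, 18)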
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/corner_pool.py b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/corner_pool.py
new file mode 100644
index 0000000000000000000000000000000000000000..aecd96bb9916db806b1ebe172aaf1b7dcf5235f1
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/corner_pool.py
@@ -0,0 +1,176 @@
+# encoding=utf-8
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+from torch import nn
+from torch.autograd import Function
+
+from ..utils import ext_loader
+
+ext_module = ext_loader.load_ext('_ext', [
+ 'top_pool_forward', 'top_pool_backward', 'bottom_pool_forward',
+ 'bottom_pool_backward', 'left_pool_forward', 'left_pool_backward',
+ 'right_pool_forward', 'right_pool_backward'
+])
+
+_mode_dict = {'top': 0, 'bottom': 1, 'left': 2, 'right': 3}
+
+
+class TopPoolFunction(Function):
+
+ @staticmethod
+ def symbolic(g, input):
+ output = g.op(
+ 'mmcv::MMCVCornerPool', input, mode_i=int(_mode_dict['top']))
+ return output
+
+ @staticmethod
+ def forward(ctx, input):
+ output = ext_module.top_pool_forward(input)
+ ctx.save_for_backward(input)
+ return output
+
+ @staticmethod
+ def backward(ctx, grad_output):
+ input, = ctx.saved_tensors
+ output = ext_module.top_pool_backward(input, grad_output)
+ return output
+
+
+class BottomPoolFunction(Function):
+
+ @staticmethod
+ def symbolic(g, input):
+ output = g.op(
+ 'mmcv::MMCVCornerPool', input, mode_i=int(_mode_dict['bottom']))
+ return output
+
+ @staticmethod
+ def forward(ctx, input):
+ output = ext_module.bottom_pool_forward(input)
+ ctx.save_for_backward(input)
+ return output
+
+ @staticmethod
+ def backward(ctx, grad_output):
+ input, = ctx.saved_tensors
+ output = ext_module.bottom_pool_backward(input, grad_output)
+ return output
+
+
+class LeftPoolFunction(Function):
+
+ @staticmethod
+ def symbolic(g, input):
+ output = g.op(
+ 'mmcv::MMCVCornerPool', input, mode_i=int(_mode_dict['left']))
+ return output
+
+ @staticmethod
+ def forward(ctx, input):
+ output = ext_module.left_pool_forward(input)
+ ctx.save_for_backward(input)
+ return output
+
+ @staticmethod
+ def backward(ctx, grad_output):
+ input, = ctx.saved_tensors
+ output = ext_module.left_pool_backward(input, grad_output)
+ return output
+
+
+class RightPoolFunction(Function):
+
+ @staticmethod
+ def symbolic(g, input):
+ output = g.op(
+ 'mmcv::MMCVCornerPool', input, mode_i=int(_mode_dict['right']))
+ return output
+
+ @staticmethod
+ def forward(ctx, input):
+ output = ext_module.right_pool_forward(input)
+ ctx.save_for_backward(input)
+ return output
+
+ @staticmethod
+ def backward(ctx, grad_output):
+ input, = ctx.saved_tensors
+ output = ext_module.right_pool_backward(input, grad_output)
+ return output
+
+
+class CornerPool(nn.Module):
+ """Corner Pooling.
+
+ Corner Pooling is a new type of pooling layer that helps a
+ convolutional network better localize corners of bounding boxes.
+
+    Please refer to `CornerNet: Detecting Objects as Paired Keypoints
+    <https://arxiv.org/abs/1808.01244>`_ for more details.
+
+ Code is modified from https://github.com/princeton-vl/CornerNet-Lite.
+
+ Args:
+ mode (str): Pooling orientation for the pooling layer
+
+ - 'bottom': Bottom Pooling
+ - 'left': Left Pooling
+ - 'right': Right Pooling
+ - 'top': Top Pooling
+
+ Returns:
+ Feature map after pooling.
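+
+    Example:
+        >>> # Illustrative sketch (not from the original file); it assumes a
+        >>> # PyTorch build where the pure-PyTorch `cummax` branch below is
+        >>> # taken, so the compiled extension is not required.
+        >>> pool = CornerPool('top')
+        >>> feat = torch.rand(1, 8, 16, 16)
+        >>> out = pool(feat)  # same shape as `feat`: (1, 8, 16, 16)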
+ """
+
+ pool_functions = {
+ 'bottom': BottomPoolFunction,
+ 'left': LeftPoolFunction,
+ 'right': RightPoolFunction,
+ 'top': TopPoolFunction,
+ }
+
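+    # Pooling toward a border is equivalent to a cumulative max along the
+    # height (dim=2) or width (dim=3) axis; 'top' and 'left' scan against the
+    # natural index order, so the tensor is flipped before and after cummax.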
+ cummax_dim_flip = {
+ 'bottom': (2, False),
+ 'left': (3, True),
+ 'right': (3, False),
+ 'top': (2, True),
+ }
+
+ def __init__(self, mode):
+ super(CornerPool, self).__init__()
+ assert mode in self.pool_functions
+ self.mode = mode
+ self.corner_pool = self.pool_functions[mode]
+
+ def forward(self, x):
+ if torch.__version__ != 'parrots' and torch.__version__ >= '1.5.0':
+ if torch.onnx.is_in_onnx_export():
+                assert torch.__version__ >= '1.7.0', \
+                    'When `cummax` serves as an intermediate component whose '\
+                    'outputs are used as inputs to other modules, the '\
+                    'PyTorch version must be >= 1.7.0; otherwise an error '\
+                    'like `RuntimeError: tuple appears in op that does not '\
+                    'forward tuples, unsupported kind: prim::PythonOp` '\
+                    'is raised.'
+
+ dim, flip = self.cummax_dim_flip[self.mode]
+ if flip:
+ x = x.flip(dim)
+ pool_tensor, _ = torch.cummax(x, dim=dim)
+ if flip:
+ pool_tensor = pool_tensor.flip(dim)
+ return pool_tensor
+ else:
+ return self.corner_pool.apply(x)
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/correlation.py b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/correlation.py
new file mode 100644
index 0000000000000000000000000000000000000000..f520636431baaeff4ba752955434b3b2f0aaee31
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/correlation.py
@@ -0,0 +1,209 @@
+# encoding=utf-8
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+from torch import Tensor, nn
+from torch.autograd import Function
+from torch.autograd.function import once_differentiable
+from torch.nn.modules.utils import _pair
+
+from ..utils import ext_loader
+
+ext_module = ext_loader.load_ext(
+ '_ext', ['correlation_forward', 'correlation_backward'])
+
+
+class CorrelationFunction(Function):
+
+ @staticmethod
+ def forward(ctx,
+ input1,
+ input2,
+ kernel_size=1,
+ max_displacement=1,
+ stride=1,
+ padding=1,
+ dilation=1,
+ dilation_patch=1):
+
+ ctx.save_for_backward(input1, input2)
+
+ kH, kW = ctx.kernel_size = _pair(kernel_size)
+ patch_size = max_displacement * 2 + 1
+ ctx.patch_size = patch_size
+ dH, dW = ctx.stride = _pair(stride)
+ padH, padW = ctx.padding = _pair(padding)
+ dilationH, dilationW = ctx.dilation = _pair(dilation)
+ dilation_patchH, dilation_patchW = ctx.dilation_patch = _pair(
+ dilation_patch)
+
+ output_size = CorrelationFunction._output_size(ctx, input1)
+
+ output = input1.new_zeros(output_size)
+
+ ext_module.correlation_forward(
+ input1,
+ input2,
+ output,
+ kH=kH,
+ kW=kW,
+ patchH=patch_size,
+ patchW=patch_size,
+ padH=padH,
+ padW=padW,
+ dilationH=dilationH,
+ dilationW=dilationW,
+ dilation_patchH=dilation_patchH,
+ dilation_patchW=dilation_patchW,
+ dH=dH,
+ dW=dW)
+
+ return output
+
+ @staticmethod
+ @once_differentiable
+ def backward(ctx, grad_output):
+ input1, input2 = ctx.saved_tensors
+
+ kH, kW = ctx.kernel_size
+ patch_size = ctx.patch_size
+ padH, padW = ctx.padding
+ dilationH, dilationW = ctx.dilation
+ dilation_patchH, dilation_patchW = ctx.dilation_patch
+ dH, dW = ctx.stride
+ grad_input1 = torch.zeros_like(input1)
+ grad_input2 = torch.zeros_like(input2)
+
+ ext_module.correlation_backward(
+ grad_output,
+ input1,
+ input2,
+ grad_input1,
+ grad_input2,
+ kH=kH,
+ kW=kW,
+ patchH=patch_size,
+ patchW=patch_size,
+ padH=padH,
+ padW=padW,
+ dilationH=dilationH,
+ dilationW=dilationW,
+ dilation_patchH=dilation_patchH,
+ dilation_patchW=dilation_patchW,
+ dH=dH,
+ dW=dW)
+ return grad_input1, grad_input2, None, None, None, None, None, None
+
+ @staticmethod
+ def _output_size(ctx, input1):
+ iH, iW = input1.size(2), input1.size(3)
+ batch_size = input1.size(0)
+ kH, kW = ctx.kernel_size
+ patch_size = ctx.patch_size
+ dH, dW = ctx.stride
+ padH, padW = ctx.padding
+ dilationH, dilationW = ctx.dilation
+ dilatedKH = (kH - 1) * dilationH + 1
+ dilatedKW = (kW - 1) * dilationW + 1
+
+ oH = int((iH + 2 * padH - dilatedKH) / dH + 1)
+ oW = int((iW + 2 * padW - dilatedKW) / dW + 1)
+
+ output_size = (batch_size, patch_size, patch_size, oH, oW)
+ return output_size
+
+
+class Correlation(nn.Module):
+ r"""Correlation operator
+
+ This correlation operator works for optical flow correlation computation.
+
+ There are two batched tensors with shape :math:`(N, C, H, W)`,
+    and the correlation output's shape is :math:`(N, max\_displacement \times
+    2 + 1, max\_displacement \times 2 + 1, H_{out}, W_{out})`
+
+ where
+
+ .. math::
+ H_{out} = \left\lfloor\frac{H_{in} + 2 \times padding -
+ dilation \times (kernel\_size - 1) - 1}
+ {stride} + 1\right\rfloor
+
+ .. math::
+ W_{out} = \left\lfloor\frac{W_{in} + 2 \times padding - dilation
+ \times (kernel\_size - 1) - 1}
+ {stride} + 1\right\rfloor
+
+ the correlation item :math:`(N_i, dy, dx)` is formed by taking the sliding
+ window convolution between input1 and shifted input2,
+
+ .. math::
+ Corr(N_i, dx, dy) =
+ \sum_{c=0}^{C-1}
+ input1(N_i, c) \star
+ \mathcal{S}(input2(N_i, c), dy, dx)
+
+    where :math:`\star` is the valid 2d sliding window convolution operator,
+    :math:`\mathcal{S}` means shifting the input features (out-of-boundary
+    positions are zero-padded), and :math:`dx, dy` are the shifting distances,
+    :math:`dx, dy \in [-max\_displacement \times dilation\_patch,
+    max\_displacement \times dilation\_patch]`.
+
+ Args:
+        kernel_size (int): The size of the sliding window, i.e. the local
+            neighborhood around each center point that is involved in the
+            correlation computation. Defaults to 1.
+        max_displacement (int): The radius used to compute the correlation
+            volume; the actual search range can be dilated by
+            ``dilation_patch``. Defaults to 1.
+ stride (int): The stride of the sliding blocks in the input spatial
+ dimensions. Defaults to 1.
+ padding (int): Zero padding added to all four sides of the input1.
+ Defaults to 0.
+        dilation (int): The spacing within the local neighborhood that is
+            involved in the correlation. Defaults to 1.
+        dilation_patch (int): The spacing between positions at which the
+            correlation is computed. Defaults to 1.
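+
+    Example:
+        >>> # Illustrative sketch; the compiled `_ext` correlation op must be
+        >>> # available for the device the inputs live on.
+        >>> corr = Correlation(max_displacement=4)
+        >>> input1 = torch.randn(2, 16, 32, 32)
+        >>> input2 = torch.randn(2, 16, 32, 32)
+        >>> out = corr(input1, input2)
+        >>> # patch size = 2 * 4 + 1 = 9, so `out` has shape (2, 9, 9, 32, 32)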
+ """
+
+ def __init__(self,
+ kernel_size: int = 1,
+ max_displacement: int = 1,
+ stride: int = 1,
+ padding: int = 0,
+ dilation: int = 1,
+ dilation_patch: int = 1) -> None:
+ super().__init__()
+ self.kernel_size = kernel_size
+ self.max_displacement = max_displacement
+ self.stride = stride
+ self.padding = padding
+ self.dilation = dilation
+ self.dilation_patch = dilation_patch
+
+ def forward(self, input1: Tensor, input2: Tensor) -> Tensor:
+ return CorrelationFunction.apply(input1, input2, self.kernel_size,
+ self.max_displacement, self.stride,
+ self.padding, self.dilation,
+ self.dilation_patch)
+
+ def __repr__(self) -> str:
+ s = self.__class__.__name__
+ s += f'(kernel_size={self.kernel_size}, '
+ s += f'max_displacement={self.max_displacement}, '
+ s += f'stride={self.stride}, '
+ s += f'padding={self.padding}, '
+ s += f'dilation={self.dilation}, '
+ s += f'dilation_patch={self.dilation_patch})'
+ return s
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/csrc/README.md b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/csrc/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3bc02004017a0d607131b4de168b320c3beed23c
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/csrc/README.md
@@ -0,0 +1,170 @@
+# Code Structure of CUDA operators
+
+This folder contains all non-Python code for MMCV custom ops. Please follow the same architecture if you want to add new ops.
+
+## Directories Tree
+
+```folder
+.
+├── common
+│ ├── box_iou_rotated_utils.hpp
+│ ├── parrots_cpp_helper.hpp
+│ ├── parrots_cuda_helper.hpp
+│ ├── pytorch_cpp_helper.hpp
+│ ├── pytorch_cuda_helper.hpp
+│ ├── pytorch_device_registry.hpp
+│ └── cuda
+│ ├── common_cuda_helper.hpp
+│ ├── parrots_cudawarpfunction.cuh
+│ ├── ...
+│ └── ops_cuda_kernel.cuh
+├── onnxruntime
+│ ├── onnxruntime_register.h
+│ ├── onnxruntime_session_options_config_keys.h
+│ ├── ort_mmcv_utils.h
+│ ├── ...
+│ ├── onnx_ops.h
+│ └── cpu
+│ ├── onnxruntime_register.cpp
+│ ├── ...
+│ └── onnx_ops_impl.cpp
+├── parrots
+│ ├── ...
+│ ├── ops.cpp
+│ ├── ops_parrots.cpp
+│ └── ops_pytorch.h
+├── pytorch
+│ ├── info.cpp
+│ ├── pybind.cpp
+│ ├── ...
+│ ├── ops.cpp
+│ ├── cuda
+│ │ ├── ...
+│ │ └── ops_cuda.cu
+│ └── cpu
+│ ├── ...
+│ └── ops.cpp
+└── tensorrt
+ ├── trt_cuda_helper.cuh
+ ├── trt_plugin_helper.hpp
+ ├── trt_plugin.hpp
+ ├── trt_serialize.hpp
+ ├── ...
+ ├── trt_ops.hpp
+ └── plugins
+ ├── trt_cuda_helper.cu
+ ├── trt_plugin.cpp
+ ├── ...
+ ├── trt_ops.cpp
+ └── trt_ops_kernel.cu
+```
+
+## Components
+
+- `common`: This directory contains all tools and shared code.
+  - `cuda`: The CUDA kernels that can be shared by all backends. **HIP** kernels also live here since they have similar syntax.
+- `onnxruntime`: **ONNX Runtime** support for custom ops.
+  - `cpu`: CPU implementation of supported ops.
+- `parrots`: **Parrots** is a deep learning framework for model training and inference. Parrots custom ops are placed in this directory.
+- `pytorch`: **PyTorch** custom ops are supported by binding C++ to Python with **pybind11**. The op implementations and binding code are placed in this directory.
+  - `cuda`: This directory contains the CUDA kernel launchers, which feed tensor memory pointers to the CUDA kernels in `common/cuda`. The launchers provide a C++ interface to the CUDA implementation of the corresponding custom ops.
+  - `cpu`: This directory contains CPU implementations of the corresponding custom ops.
+- `tensorrt`: **TensorRT** support for custom ops.
+  - `plugins`: This directory contains the implementations of the supported custom ops. Some ops may also use the shared CUDA kernels in `common/cuda`.
+
+## How to add new PyTorch ops?
+
+1. (Optional) Add a shared kernel in `common` to support special hardware platforms.
+
+ ```c++
+ // src/common/cuda/new_ops_cuda_kernel.cuh
+
+   template <typename T>
+ __global__ void new_ops_forward_cuda_kernel(const T* input, T* output, ...) {
+ // forward here
+ }
+
+ ```
+
+ Add cuda kernel launcher in `pytorch/cuda`.
+
+ ```c++
+ // src/pytorch/cuda
+   #include <new_ops_cuda_kernel.cuh>
+
+ void NewOpsForwardCUDAKernelLauncher(Tensor input, Tensor output, ...){
+ // initialize
+ at::cuda::CUDAGuard device_guard(input.device());
+ cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+ ...
+ AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+ input.scalar_type(), "new_ops_forward_cuda_kernel", ([&] {
+           new_ops_forward_cuda_kernel<scalar_t>
+               <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
+                   input.data_ptr<scalar_t>(), output.data_ptr<scalar_t>(), ...);
+ }));
+ AT_CUDA_CHECK(cudaGetLastError());
+ }
+ ```
+
+2. Register implementation for different devices.
+
+ ```c++
+ // src/pytorch/cuda/cudabind.cpp
+ ...
+
+ Tensor new_ops_forward_cuda(Tensor input, Tensor output, ...){
+ // implement cuda forward here
+ // use `NewOpsForwardCUDAKernelLauncher` here
+ }
+ // declare interface here.
+ Tensor new_ops_forward_impl(Tensor input, Tensor output, ...);
+ // register the implementation for given device (CUDA here).
+ REGISTER_DEVICE_IMPL(new_ops_forward_impl, CUDA, new_ops_forward_cuda);
+ ```
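+
+   A CPU implementation can be registered through the same device registry.
+   The snippet below is only a sketch following the pattern above;
+   `new_ops_forward_cpu` is a hypothetical name, not an existing MMCV symbol.
+
+   ```c++
+   // src/pytorch/cpu/new_ops.cpp (hypothetical)
+   Tensor new_ops_forward_cpu(Tensor input, Tensor output, ...){
+     // implement cpu forward here
+   }
+   // register the implementation for the CPU device.
+   REGISTER_DEVICE_IMPL(new_ops_forward_impl, CPU, new_ops_forward_cpu);
+   ```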
+
+3. Add the op implementation in the `pytorch` directory, and select the implementation according to the device type.
+
+ ```c++
+ // src/pytorch/new_ops.cpp
+ Tensor new_ops_forward_impl(Tensor input, Tensor output, ...){
+ // dispatch the implementation according to the device type of input.
+ DISPATCH_DEVICE_IMPL(new_ops_forward_impl, input, output, ...);
+ }
+ ...
+
+ Tensor new_ops_forward(Tensor input, Tensor output, ...){
+ return new_ops_forward_impl(input, output, ...);
+ }
+ ```
+
+4. Bind the implementation in `pytorch/pybind.cpp`.
+
+ ```c++
+ // src/pytorch/pybind.cpp
+
+ ...
+
+ Tensor new_ops_forward(Tensor input, Tensor output, ...);
+
+ ...
+
+ // bind with pybind11
+ m.def("new_ops_forward", &new_ops_forward, "new_ops_forward",
+ py::arg("input"), py::arg("output"), ...);
+
+ ...
+
+ ```
+
+5. Build MMCV again. Enjoy the new ops in Python:
+
+ ```python
+ from ..utils import ext_loader
+ ext_module = ext_loader.load_ext('_ext', ['new_ops_forward'])
+
+ ...
+
+ ext_module.new_ops_forward(input, output, ...)
+
+ ```
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/csrc/common/box_iou_rotated_utils.hpp b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/csrc/common/box_iou_rotated_utils.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..243200e156f1384b625d6bac7fa4c68e533d9441
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/csrc/common/box_iou_rotated_utils.hpp
@@ -0,0 +1,347 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+// modified from
+// https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_utils.h
+#pragma once
+#include <cassert>
+#include <cmath>
+
+#ifdef __CUDACC__
+// Designates functions callable from the host (CPU) and the device (GPU)
+#define HOST_DEVICE __host__ __device__
+#define HOST_DEVICE_INLINE HOST_DEVICE __forceinline__
+#else
+#include <algorithm>
+#define HOST_DEVICE
+#define HOST_DEVICE_INLINE HOST_DEVICE inline
+#endif
+
+namespace {
+
+template <typename T>
+struct RotatedBox {
+ T x_ctr, y_ctr, w, h, a;
+};
+
+template <typename T>
+struct Point {
+ T x, y;
+ HOST_DEVICE_INLINE Point(const T& px = 0, const T& py = 0) : x(px), y(py) {}
+ HOST_DEVICE_INLINE Point operator+(const Point& p) const {
+ return Point(x + p.x, y + p.y);
+ }
+ HOST_DEVICE_INLINE Point& operator+=(const Point& p) {
+ x += p.x;
+ y += p.y;
+ return *this;
+ }
+ HOST_DEVICE_INLINE Point operator-(const Point& p) const {
+ return Point(x - p.x, y - p.y);
+ }
+ HOST_DEVICE_INLINE Point operator*(const T coeff) const {
+ return Point(x * coeff, y * coeff);
+ }
+};
+
+template <typename T>
+HOST_DEVICE_INLINE T dot_2d(const Point<T>& A, const Point<T>& B) {
+ return A.x * B.x + A.y * B.y;
+}
+
+template <typename T>
+HOST_DEVICE_INLINE T cross_2d(const Point<T>& A, const Point<T>& B) {
+ return A.x * B.y - B.x * A.y;
+}
+
+template <typename T>
+HOST_DEVICE_INLINE void get_rotated_vertices(const RotatedBox<T>& box,
+                                             Point<T> (&pts)[4]) {
+ // M_PI / 180. == 0.01745329251
+ // double theta = box.a * 0.01745329251;
+ // MODIFIED
+ double theta = box.a;
+ T cosTheta2 = (T)cos(theta) * 0.5f;
+ T sinTheta2 = (T)sin(theta) * 0.5f;
+
+ // y: top --> down; x: left --> right
+ pts[0].x = box.x_ctr - sinTheta2 * box.h - cosTheta2 * box.w;
+ pts[0].y = box.y_ctr + cosTheta2 * box.h - sinTheta2 * box.w;
+ pts[1].x = box.x_ctr + sinTheta2 * box.h - cosTheta2 * box.w;
+ pts[1].y = box.y_ctr - cosTheta2 * box.h - sinTheta2 * box.w;
+ pts[2].x = 2 * box.x_ctr - pts[0].x;
+ pts[2].y = 2 * box.y_ctr - pts[0].y;
+ pts[3].x = 2 * box.x_ctr - pts[1].x;
+ pts[3].y = 2 * box.y_ctr - pts[1].y;
+}
+
+template <typename T>
+HOST_DEVICE_INLINE int get_intersection_points(const Point<T> (&pts1)[4],
+                                               const Point<T> (&pts2)[4],
+                                               Point<T> (&intersections)[24]) {
+ // Line vector
+ // A line from p1 to p2 is: p1 + (p2-p1)*t, t=[0,1]
+  Point<T> vec1[4], vec2[4];
+ for (int i = 0; i < 4; i++) {
+ vec1[i] = pts1[(i + 1) % 4] - pts1[i];
+ vec2[i] = pts2[(i + 1) % 4] - pts2[i];
+ }
+
+ // Line test - test all line combos for intersection
+ int num = 0; // number of intersections
+ for (int i = 0; i < 4; i++) {
+ for (int j = 0; j < 4; j++) {
+ // Solve for 2x2 Ax=b
+ T det = cross_2d(vec2[j], vec1[i]);
+
+ // This takes care of parallel lines
+ if (fabs(det) <= 1e-14) {
+ continue;
+ }
+
+ auto vec12 = pts2[j] - pts1[i];
+
+ T t1 = cross_2d(vec2[j], vec12) / det;
+ T t2 = cross_2d(vec1[i], vec12) / det;
+
+ if (t1 >= 0.0f && t1 <= 1.0f && t2 >= 0.0f && t2 <= 1.0f) {
+ intersections[num++] = pts1[i] + vec1[i] * t1;
+ }
+ }
+ }
+
+ // Check for vertices of rect1 inside rect2
+ {
+ const auto& AB = vec2[0];
+ const auto& DA = vec2[3];
+ auto ABdotAB = dot_2d(AB, AB);
+ auto ADdotAD = dot_2d(DA, DA);
+ for (int i = 0; i < 4; i++) {
+ // assume ABCD is the rectangle, and P is the point to be judged
+ // P is inside ABCD iff. P's projection on AB lies within AB
+ // and P's projection on AD lies within AD
+
+ auto AP = pts1[i] - pts2[0];
+
+ auto APdotAB = dot_2d(AP, AB);
+ auto APdotAD = -dot_2d(AP, DA);
+
+ if ((APdotAB >= 0) && (APdotAD >= 0) && (APdotAB <= ABdotAB) &&
+ (APdotAD <= ADdotAD)) {
+ intersections[num++] = pts1[i];
+ }
+ }
+ }
+
+ // Reverse the check - check for vertices of rect2 inside rect1
+ {
+ const auto& AB = vec1[0];
+ const auto& DA = vec1[3];
+ auto ABdotAB = dot_2d(AB, AB);
+ auto ADdotAD = dot_2d(DA, DA);
+ for (int i = 0; i < 4; i++) {
+ auto AP = pts2[i] - pts1[0];
+
+ auto APdotAB = dot_2d(AP, AB);
+ auto APdotAD = -dot_2d(AP, DA);
+
+ if ((APdotAB >= 0) && (APdotAD >= 0) && (APdotAB <= ABdotAB) &&
+ (APdotAD <= ADdotAD)) {
+ intersections[num++] = pts2[i];
+ }
+ }
+ }
+
+ return num;
+}
+
+template <typename T>
+HOST_DEVICE_INLINE int convex_hull_graham(const Point<T> (&p)[24],
+                                          const int& num_in, Point<T> (&q)[24],
+ bool shift_to_zero = false) {
+ assert(num_in >= 2);
+
+ // Step 1:
+ // Find point with minimum y
+ // if more than 1 points have the same minimum y,
+ // pick the one with the minimum x.
+ int t = 0;
+ for (int i = 1; i < num_in; i++) {
+ if (p[i].y < p[t].y || (p[i].y == p[t].y && p[i].x < p[t].x)) {
+ t = i;
+ }
+ }
+ auto& start = p[t]; // starting point
+
+ // Step 2:
+ // Subtract starting point from every points (for sorting in the next step)
+ for (int i = 0; i < num_in; i++) {
+ q[i] = p[i] - start;
+ }
+
+ // Swap the starting point to position 0
+ auto tmp = q[0];
+ q[0] = q[t];
+ q[t] = tmp;
+
+ // Step 3:
+ // Sort point 1 ~ num_in according to their relative cross-product values
+ // (essentially sorting according to angles)
+ // If the angles are the same, sort according to their distance to origin
+ T dist[24];
+ for (int i = 0; i < num_in; i++) {
+ dist[i] = dot_2d(q[i], q[i]);
+ }
+
+#ifdef __CUDACC__
+ // CUDA version
+ // In the future, we can potentially use thrust
+ // for sorting here to improve speed (though not guaranteed)
+ for (int i = 1; i < num_in - 1; i++) {
+ for (int j = i + 1; j < num_in; j++) {
+ T crossProduct = cross_2d(q[i], q[j]);
+ if ((crossProduct < -1e-6) ||
+ (fabs(crossProduct) < 1e-6 && dist[i] > dist[j])) {
+ auto q_tmp = q[i];
+ q[i] = q[j];
+ q[j] = q_tmp;
+ auto dist_tmp = dist[i];
+ dist[i] = dist[j];
+ dist[j] = dist_tmp;
+ }
+ }
+ }
+#else
+ // CPU version
+ std::sort(q + 1, q + num_in,
+            [](const Point<T>& A, const Point<T>& B) -> bool {
+ T temp = cross_2d(A, B);
+ if (fabs(temp) < 1e-6) {
+ return dot_2d(A, A) < dot_2d(B, B);
+ } else {
+ return temp > 0;
+ }
+ });
+ // compute distance to origin after sort, since the points are now different.
+ for (int i = 0; i < num_in; i++) {
+ dist[i] = dot_2d(q[i], q[i]);
+ }
+#endif
+
+ // Step 4:
+ // Make sure there are at least 2 points (that don't overlap with each other)
+ // in the stack
+ int k; // index of the non-overlapped second point
+ for (k = 1; k < num_in; k++) {
+ if (dist[k] > 1e-8) {
+ break;
+ }
+ }
+ if (k == num_in) {
+ // We reach the end, which means the convex hull is just one point
+ q[0] = p[t];
+ return 1;
+ }
+ q[1] = q[k];
+ int m = 2; // 2 points in the stack
+ // Step 5:
+ // Finally we can start the scanning process.
+ // When a non-convex relationship between the 3 points is found
+ // (either concave shape or duplicated points),
+ // we pop the previous point from the stack
+ // until the 3-point relationship is convex again, or
+ // until the stack only contains two points
+ for (int i = k + 1; i < num_in; i++) {
+ while (m > 1 && cross_2d(q[i] - q[m - 2], q[m - 1] - q[m - 2]) >= 0) {
+ m--;
+ }
+ q[m++] = q[i];
+ }
+
+ // Step 6 (Optional):
+ // In general sense we need the original coordinates, so we
+ // need to shift the points back (reverting Step 2)
+ // But if we're only interested in getting the area/perimeter of the shape
+ // We can simply return.
+ if (!shift_to_zero) {
+ for (int i = 0; i < m; i++) {
+ q[i] += start;
+ }
+ }
+
+ return m;
+}
+
+template <typename T>
+HOST_DEVICE_INLINE T polygon_area(const Point<T> (&q)[24], const int& m) {
+ if (m <= 2) {
+ return 0;
+ }
+
+ T area = 0;
+ for (int i = 1; i < m - 1; i++) {
+ area += fabs(cross_2d(q[i] - q[0], q[i + 1] - q[0]));
+ }
+
+ return area / 2.0;
+}
+
+template <typename T>
+HOST_DEVICE_INLINE T rotated_boxes_intersection(const RotatedBox<T>& box1,
+                                                const RotatedBox<T>& box2) {
+ // There are up to 4 x 4 + 4 + 4 = 24 intersections (including dups) returned
+ // from rotated_rect_intersection_pts
+  Point<T> intersectPts[24], orderedPts[24];
+
+  Point<T> pts1[4];
+  Point<T> pts2[4];
+ get_rotated_vertices(box1, pts1);
+ get_rotated_vertices(box2, pts2);
+
+ int num = get_intersection_points(pts1, pts2, intersectPts);
+
+ if (num <= 2) {
+ return 0.0;
+ }
+
+ // Convex Hull to order the intersection points in clockwise order and find
+ // the contour area.
+ int num_convex = convex_hull_graham(intersectPts, num, orderedPts, true);
+ return polygon_area(orderedPts, num_convex);
+}
+
+} // namespace
+
+template <typename T>
+HOST_DEVICE_INLINE T single_box_iou_rotated(T const* const box1_raw,
+ T const* const box2_raw,
+ const int mode_flag) {
+ // shift center to the middle point to achieve higher precision in result
+  RotatedBox<T> box1, box2;
+ auto center_shift_x = (box1_raw[0] + box2_raw[0]) / 2.0;
+ auto center_shift_y = (box1_raw[1] + box2_raw[1]) / 2.0;
+ box1.x_ctr = box1_raw[0] - center_shift_x;
+ box1.y_ctr = box1_raw[1] - center_shift_y;
+ box1.w = box1_raw[2];
+ box1.h = box1_raw[3];
+ box1.a = box1_raw[4];
+ box2.x_ctr = box2_raw[0] - center_shift_x;
+ box2.y_ctr = box2_raw[1] - center_shift_y;
+ box2.w = box2_raw[2];
+ box2.h = box2_raw[3];
+ box2.a = box2_raw[4];
+
+ const T area1 = box1.w * box1.h;
+ const T area2 = box2.w * box2.h;
+ if (area1 < 1e-14 || area2 < 1e-14) {
+ return 0.f;
+ }
+
+ const T intersection = rotated_boxes_intersection(box1, box2);
+ T baseS = 1.0;
+ if (mode_flag == 0) {
+ baseS = (area1 + area2 - intersection);
+ } else if (mode_flag == 1) {
+ baseS = area1;
+ }
+ const T iou = intersection / baseS;
+ return iou;
+}
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/csrc/common/cuda/active_rotated_filter_cuda_kernel.cuh b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/csrc/common/cuda/active_rotated_filter_cuda_kernel.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..0f7454032bc12d4304b923709b09d335daa3cd07
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/csrc/common/cuda/active_rotated_filter_cuda_kernel.cuh
@@ -0,0 +1,72 @@
+// encoding=utf-8
+// Copyright 2021 Huawei Technologies Co., Ltd
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// Modified from
+// https://github.com/csuhan/s2anet/blob/master/mmdet/ops/orn/src/cuda/ActiveRotatingFilter_cuda.cu
+#ifndef ACTIVE_ROTATED_FILTER_CUDA_KERNEL_CUH
+#define ACTIVE_ROTATED_FILTER_CUDA_KERNEL_CUH
+
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else
+#include "pytorch_cuda_helper.hpp"
+#endif
+
+template <typename scalar_t>
+__global__ void active_rotated_filter_forward_cuda_kernel(
+ const int nthreads, const scalar_t* weight_data, const int* indices_data,
+ const int num_input_planes, const int num_output_planes,
+ const int num_orientations, const int num_rotations, const int nEntry,
+ scalar_t* output_data) {
+ CUDA_1D_KERNEL_LOOP(index, nthreads) {
+ int l = index % nEntry;
+ int j = (index / nEntry) % num_input_planes;
+ int i = index / nEntry / num_input_planes;
+ int k;
+ scalar_t val = *(weight_data + index);
+ for (k = 0; k < num_rotations; k++) {
+ int idx = (int)(*(indices_data + l * num_rotations + k)) - 1;
+ scalar_t* target = output_data +
+ i * (num_rotations * num_input_planes * nEntry) +
+ k * (num_input_planes * nEntry) + j * (nEntry) + idx;
+ *target = val;
+ }
+ }
+}
+
+template <typename scalar_t>
+__global__ void active_rotated_filter_backward_cuda_kernel(
+ const int nthreads, const scalar_t* gradWeight_data,
+ const int* indices_data, const int num_input_planes,
+ const int num_output_planes, const int num_orientations,
+ const int num_rotations, const int nEntry, scalar_t* weight_data) {
+ CUDA_1D_KERNEL_LOOP(index, nthreads) {
+ int l = index % nEntry;
+ int j = (index / nEntry) % num_input_planes;
+ int i = index / nEntry / num_input_planes;
+ int k;
+ scalar_t* val = weight_data + index;
+ *val = 0;
+ scalar_t tmp = 0;
+ for (k = 0; k < num_rotations; k++) {
+ int idx = (int)(*(indices_data + l * num_rotations + k)) - 1;
+ scalar_t target =
+ *(gradWeight_data + i * (num_rotations * num_input_planes * nEntry) +
+ k * (num_input_planes * nEntry) + j * (nEntry) + idx);
+ tmp = tmp + target;
+ }
+ *val = tmp;
+ }
+}
+#endif // ACTIVE_ROTATED_FILTER_CUDA_KERNEL_CUH
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/csrc/common/cuda/assign_score_withk_cuda_kernel.cuh b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/csrc/common/cuda/assign_score_withk_cuda_kernel.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..2770fc4db71ddafc555778abfb7618c459f91110
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/csrc/common/cuda/assign_score_withk_cuda_kernel.cuh
@@ -0,0 +1,129 @@
+// encoding=utf-8
+// Copyright 2021 Huawei Technologies Co., Ltd
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#ifndef ASSIGN_SCORE_WITHK_CUDA_KERNEL_CUH
+#define ASSIGN_SCORE_WITHK_CUDA_KERNEL_CUH
+
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else
+#include "pytorch_cuda_helper.hpp"
+#endif
+
+// input: points(B,N0,M,O), centers(B,N0,M,O), scores(B,N1,K,M), knn_idx(B,N1,K)
+// output: fout(B,O,N)
+// algo: fout(b,i,k,j) = s(b,i,k,m)*p(b,c(i),k,m,j) = s(b,i,k,m)*p(b,i(k),m,j)
+// i(k) = idx(b,i,k)
+// sum: fout(b,i,j) = fout(b,i,j) + s(b,i,k,m)*p(b,i,k,m,j)
+// avg: fout(b,i,j) = sum(fout(b,i,k,j)) / k
+// max: fout(b,i,j) = max(fout(b,i,k,j), sum(s(b,i,k,m)*p(b,i,k,m,j)))
+
+template <typename T>
+__global__ void assign_score_withk_forward_cuda_kernel(
+ const int B, const int N0, const int N1, const int M, const int K,
+ const int O, const int aggregate, const T* points, const T* centers,
+ const T* scores, const int64_t* knn_idx, T* output) {
+ // ----- parallel loop for B, N1, K and O ---------
+ CUDA_1D_KERNEL_LOOP(i, B * O * N1 * K) {
+ // ------- loop for M ----------
+ const int b = (int)(i / (O * N1 * K));
+ const int o = (int)(i % (O * N1 * K) / (N1 * K));
+ const int n = (int)(i % (N1 * K) / K);
+ const int k = (int)(i % K);
+ const int cn = (int)knn_idx[b * K * N1 + n * K +
+ 0]; // The first neighbor is the center point
+ const int kn = (int)knn_idx[b * K * N1 + n * K + k];
+ if (kn >= N0 ||
+ kn < 0) { // if index overflows, it is out of the neighborhood range
+ return;
+ }
+ assert(b < B);
+ assert(kn < N0);
+ assert(cn < N0);
+ assert(o < O);
+ assert(n < N1);
+ const int out_idx = b * N1 * O * K + o * N1 * K + n * K + k;
+ T val = output[out_idx];
+ for (int m = 0; m < M; m++) {
+ val += points[b * N0 * M * O + kn * M * O + m * O + o] *
+ scores[b * N1 * K * M + n * K * M + k * M + m] -
+ centers[b * N0 * M * O + cn * M * O + m * O + o] *
+ scores[b * N1 * K * M + n * K * M + k * M + m];
+ }
+ output[out_idx] = val;
+ }
+}
+
+template <typename T>
+__global__ void assign_score_withk_points_backward_cuda_kernel(
+ const int B, const int N0, const int N, const int M, const int K,
+ const int O, const int aggregate, const T* grad_out, const T* scores,
+ const int64_t* knn_idx, T* grad_points, T* grad_centers) {
+ // ----- parallel loop for B, M, O ---------
+ CUDA_1D_KERNEL_LOOP(i, B * M * O) {
+ int b = (int)(i / (M * O));
+ int m = (int)(i % (M * O) / O);
+ int o = (int)(i % O);
+
+ // ----- loop for N,K ---------
+ for (int n = 0; n < N; n++) {
+ for (int k = 0; k < K; k++) {
+ int kn = knn_idx[b * N * K + n * K + k];
+ int cn = knn_idx[b * N * K + n * K + 0];
+ if (kn >= N0 || kn < 0) { // if index overflows, it is out of the
+ // neighborhood range
+ continue;
+ }
+ atomicAdd(grad_points + b * N0 * M * O + kn * M * O + m * O + o,
+ scores[b * N * K * M + n * K * M + k * M + m] *
+ grad_out[b * O * N * K + o * N * K + n * K + k]);
+ atomicAdd(grad_centers + b * N0 * M * O + cn * M * O + m * O + o,
+ -scores[b * N * K * M + n * K * M + k * M + m] *
+ grad_out[b * O * N * K + o * N * K + n * K + k]);
+ }
+ }
+ }
+}
+
+template <typename T>
+__global__ void assign_score_withk_scores_backward_cuda_kernel(
+ const int B, const int N0, const int N, const int M, const int K,
+ const int O, const int aggregate, const T* grad_out, const T* points,
+ const T* centers, const int64_t* knn_idx, T* grad_scores) {
+ // ----- parallel loop for B, N, K, M ---------
+ CUDA_1D_KERNEL_LOOP(i, B * N * K * M) {
+ const int b = (int)(i / (N * M * K));
+ const int n = (int)(i % (N * M * K) / M / K);
+ const int k = (int)(i % (M * K) / M);
+ const int m = (int)(i % M);
+ const int cn = knn_idx[b * N * K + n * K + 0];
+ const int kn = knn_idx[b * N * K + n * K + k];
+ if (kn >= N0 ||
+ kn < 0) { // if index overflows, it is out of the neighborhood range
+ return;
+ }
+
+ // -------------- loop for O ------------------------
+ const int out_idx = b * N * K * M + n * K * M + k * M + m;
+ T val = grad_scores[out_idx];
+ for (int o = 0; o < O; o++) {
+ val += (points[b * N0 * M * O + kn * M * O + m * O + o] -
+ centers[b * N0 * M * O + cn * M * O + m * O + o]) *
+ grad_out[b * O * N * K + o * N * K + n * K + k];
+ }
+ grad_scores[out_idx] = val;
+ }
+}
+
+#endif // ASSIGN_SCORE_WITHK_CUDA_KERNEL_CUH
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/csrc/common/cuda/ball_query_cuda_kernel.cuh b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/csrc/common/cuda/ball_query_cuda_kernel.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..d003d46549f0f833c8df14b092d56fbfc837b3a2
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/csrc/common/cuda/ball_query_cuda_kernel.cuh
@@ -0,0 +1,71 @@
+// encoding=utf-8
+// Copyright 2021 Huawei Technologies Co., Ltd
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// Modified from
+// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/ball_query_gpu.cu
+#ifndef BALL_QUERY_CUDA_KERNEL_CUH
+#define BALL_QUERY_CUDA_KERNEL_CUH
+
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else
+#include "pytorch_cuda_helper.hpp"
+#endif
+
+template <typename T>
+__global__ void ball_query_forward_cuda_kernel(int b, int n, int m,
+ float min_radius,
+ float max_radius, int nsample,
+ const T* new_xyz, const T* xyz,
+ int* idx) {
+ // new_xyz: (B, M, 3)
+ // xyz: (B, N, 3)
+ // output:
+ // idx: (B, M, nsample)
+ int bs_idx = blockIdx.y;
+ CUDA_1D_KERNEL_LOOP(pt_idx, m) {
+ if (bs_idx >= b) return;
+
+ new_xyz += bs_idx * m * 3 + pt_idx * 3;
+ xyz += bs_idx * n * 3;
+ idx += bs_idx * m * nsample + pt_idx * nsample;
+
+ float max_radius2 = max_radius * max_radius;
+ float min_radius2 = min_radius * min_radius;
+ T new_x = new_xyz[0];
+ T new_y = new_xyz[1];
+ T new_z = new_xyz[2];
+
+ int cnt = 0;
+ for (int k = 0; k < n; ++k) {
+ T x = xyz[k * 3 + 0];
+ T y = xyz[k * 3 + 1];
+ T z = xyz[k * 3 + 2];
+ T d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) +
+ (new_z - z) * (new_z - z);
+ if (d2 == 0 || (d2 >= min_radius2 && d2 < max_radius2)) {
+ if (cnt == 0) {
+ for (int l = 0; l < nsample; ++l) {
+ idx[l] = k;
+ }
+ }
+ idx[cnt] = k;
+ ++cnt;
+ if (cnt >= nsample) break;
+ }
+ }
+ }
+}
+
+#endif // BALL_QUERY_CUDA_KERNEL_CUH
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/csrc/common/cuda/bbox_overlaps_cuda_kernel.cuh b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/csrc/common/cuda/bbox_overlaps_cuda_kernel.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..55069b95b9bb23ee1dc2e71b192c4e1bc0b80b86
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/csrc/common/cuda/bbox_overlaps_cuda_kernel.cuh
@@ -0,0 +1,97 @@
+// encoding=utf-8
+// Copyright 2021 Huawei Technologies Co., Ltd
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#ifndef BBOX_OVERLAPS_CUDA_KERNEL_CUH
+#define BBOX_OVERLAPS_CUDA_KERNEL_CUH
+
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else
+#include "pytorch_cuda_helper.hpp"
+#endif
+
+template <typename T>
+__global__ void bbox_overlaps_cuda_kernel(const T* bbox1, const T* bbox2,
+ T* ious, const int num_bbox1,
+ const int num_bbox2, const int mode,
+ const bool aligned,
+ const int offset) {
+ if (aligned) {
+ CUDA_1D_KERNEL_LOOP(index, num_bbox1) {
+ int b1 = index;
+ int b2 = index;
+
+ int base1 = b1 * 4;
+ T b1_x1 = bbox1[base1];
+ T b1_y1 = bbox1[base1 + 1];
+ T b1_x2 = bbox1[base1 + 2];
+ T b1_y2 = bbox1[base1 + 3];
+ T b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset);
+
+ int base2 = b2 * 4;
+ T b2_x1 = bbox2[base2];
+ T b2_y1 = bbox2[base2 + 1];
+ T b2_x2 = bbox2[base2 + 2];
+ T b2_y2 = bbox2[base2 + 3];
+ T b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset);
+
+ T left = fmaxf(b1_x1, b2_x1), right = fminf(b1_x2, b2_x2);
+ T top = fmaxf(b1_y1, b2_y1), bottom = fminf(b1_y2, b2_y2);
+ T width = fmaxf(right - left + offset, 0.f);
+ T height = fmaxf(bottom - top + offset, 0.f);
+ T interS = width * height;
+ T baseS = 1.0;
+ if (mode == 0) {
+ baseS = fmaxf(b1_area + b2_area - interS, T(offset));
+ } else if (mode == 1) {
+ baseS = fmaxf(b1_area, T(offset));
+ }
+ ious[index] = interS / baseS;
+ }
+ } else {
+ CUDA_1D_KERNEL_LOOP(index, num_bbox1 * num_bbox2) {
+ int b1 = index / num_bbox2;
+ int b2 = index % num_bbox2;
+
+ int base1 = b1 * 4;
+ T b1_x1 = bbox1[base1];
+ T b1_y1 = bbox1[base1 + 1];
+ T b1_x2 = bbox1[base1 + 2];
+ T b1_y2 = bbox1[base1 + 3];
+ T b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset);
+
+ int base2 = b2 * 4;
+ T b2_x1 = bbox2[base2];
+ T b2_y1 = bbox2[base2 + 1];
+ T b2_x2 = bbox2[base2 + 2];
+ T b2_y2 = bbox2[base2 + 3];
+ T b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset);
+
+ T left = fmaxf(b1_x1, b2_x1), right = fminf(b1_x2, b2_x2);
+ T top = fmaxf(b1_y1, b2_y1), bottom = fminf(b1_y2, b2_y2);
+ T width = fmaxf(right - left + offset, 0.f);
+ T height = fmaxf(bottom - top + offset, 0.f);
+ T interS = width * height;
+ T baseS = 1.0;
+ if (mode == 0) {
+ baseS = fmaxf(b1_area + b2_area - interS, T(offset));
+ } else if (mode == 1) {
+ baseS = fmaxf(b1_area, T(offset));
+ }
+ ious[index] = interS / baseS;
+ }
+ }
+}
+
+#endif // BBOX_OVERLAPS_CUDA_KERNEL_CUH
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/csrc/common/cuda/border_align_cuda_kernel.cuh b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/csrc/common/cuda/border_align_cuda_kernel.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..e89278533c851829177f86d91c68ff3e5452f06d
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/csrc/common/cuda/border_align_cuda_kernel.cuh
@@ -0,0 +1,213 @@
+// encoding=utf-8
+// Copyright 2021 Huawei Technologies Co., Ltd
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// modified from
+// https://github.com/Megvii-BaseDetection/cvpods/blob/master/cvpods/layers/csrc/border_align/border_align_kernel.cu.
+// the main difference: (1) use `argmax_idx` for fast computing of gradient
+// during the backward. (2) `wh` is directly computed by `boxes`, rather than
+// passing it as argument to forward or backward functions.
+
+#ifndef BORDER_ALIGN_CUDA_KERNEL_CUH
+#define BORDER_ALIGN_CUDA_KERNEL_CUH
+
+#include <float.h>
+#ifdef MMCV_WITH_TRT
+#include "common_cuda_helper.hpp"
+#else // MMCV_WITH_TRT
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else // MMCV_USE_PARROTS
+#include "pytorch_cuda_helper.hpp"
+#endif // MMCV_USE_PARROTS
+#endif // MMCV_WITH_TRT
+
+enum BorderMode { Top = 0, Left = 1, Bottom = 2, Right = 3 };
+
+/*** Forward ***/
+template <typename T>
+__global__ void border_align_forward_cuda_kernel(
+ const int nthreads, const T* input, const T* boxes, T* output,
+ int* argmax_idx, const int channels, const int box_size, const int height,
+ const int width, const int pool_size) {
+ CUDA_1D_KERNEL_LOOP(index, nthreads) {
+ // (batch_idx, c_idx, box_idx) is an element paralleled for computing
+ // output, and `extreme_idx` is in range [0,3]
+ int batch_idx, c_idx, box_idx, extreme_idx, maxidx, *offset_argmax_idx;
+ const T *offset_box, *offset_input, *offset_box_x;
+ T *offset_output, box_width, box_height, stride, x_stride, y_stride, x, y,
+ val, maxval;
+
+ extreme_idx = threadIdx.y;
+ // shape (N, C, box_size, 4) for output
+ batch_idx = index / channels / box_size;
+ // shape (N, box_size, 4) for boxes
+ box_idx = index % box_size + batch_idx * box_size;
+ c_idx = (index / box_size) % channels;
+
+ offset_box = boxes + box_idx * 4;
+ box_width = *(offset_box + 2) - *offset_box;
+ box_height = *(offset_box + 3) - *(offset_box + 1);
+ offset_output = output + index * 4 + extreme_idx;
+ offset_argmax_idx = argmax_idx + index * 4 + extreme_idx;
+ // shape (N, 4C, h, w) for input.
+ // [0,C) for top feature, [C,2C) for left feature,
+ // [2C,3C) for bottom feature, [3C,4C) for right feature
+ offset_input =
+ input + (batch_idx * channels * 4 + extreme_idx * channels + c_idx) *
+ height * width;
+
+ // extreme_idx in [0,1] -> offset_box_x indexed at x1
+ // extreme_idx in [2,3] -> offset_box_x indexed at x2
+ offset_box_x = offset_box + extreme_idx / 2 * 2;
+
+ // (x1,y1) or (x2,y2) for (x,y)
+ x = *offset_box_x;
+ y = *(offset_box_x + 1);
+
+ switch (extreme_idx) {
+ // top
+ case BorderMode::Top:
+ stride = box_width / pool_size;
+ x_stride = stride;
+ y_stride = 0;
+ break;
+ // left
+ case BorderMode::Left:
+ stride = box_height / pool_size;
+ x_stride = 0;
+ y_stride = stride;
+ break;
+ // bottom
+ case BorderMode::Bottom:
+ stride = box_width / pool_size;
+ x_stride = -stride;
+ y_stride = 0;
+ break;
+ // right
+ case BorderMode::Right:
+ stride = box_height / pool_size;
+ x_stride = 0;
+ y_stride = -stride;
+ break;
+ }
+
+ // initialize maxval and maxidx with the start position (e.g. (x1,y1) or
+ // (x2,y2))
+ maxval = bilinear_interpolate(offset_input, height, width, y, x, index);
+ maxidx = 0;
+
+ // do max_pool along the border
+ for (int i = 1; i <= pool_size; i++) {
+ x += x_stride;
+ y += y_stride;
+ val = bilinear_interpolate(offset_input, height, width, y, x, index);
+ if (val > maxval) {
+ maxval = val;
+ maxidx = i;
+ }
+ }
+
+ // update output and argmax_idx
+ *offset_output = maxval;
+ *offset_argmax_idx = maxidx;
+ }
+}
+
+/*** Backward ***/
+template <typename T>
+__global__ void border_align_backward_cuda_kernel(
+ const int nthreads, const T* grad_output, const T* boxes,
+ const int* argmax_idx, T* grad_input, const int channels,
+ const int box_size, const int height, const int width,
+ const int pool_size) {
+ CUDA_1D_KERNEL_LOOP(index, nthreads) {
+ // (batch_idx, c_idx, box_idx) is an element paralleled for computing
+ // output, and `extreme_idx` is in range [0,3]
+ int batch_idx, c_idx, box_idx, extreme_idx;
+ const int* offset_argmax_idx;
+ const T *offset_grad_output, *offset_box, *offset_box_x;
+ T *offset_grad_input, box_width, box_height, stride, x_stride, y_stride, x,
+ y;
+
+ extreme_idx = threadIdx.y;
+ batch_idx = index / channels / box_size;
+ box_idx = index % box_size + batch_idx * box_size;
+ c_idx = (index / box_size) % channels;
+
+ offset_box = boxes + box_idx * 4;
+ box_width = *(offset_box + 2) - *offset_box;
+ box_height = *(offset_box + 3) - *(offset_box + 1);
+ offset_grad_output = grad_output + index * 4 + extreme_idx;
+ offset_argmax_idx = argmax_idx + index * 4 + extreme_idx;
+ // [0,C) for top feature grad, [C,2C) for left feature grad,
+ // [2C,3C) for bottom feature grad, [3C,4C) for right feature grad
+ offset_grad_input = grad_input + (batch_idx * channels * 4 +
+ extreme_idx * channels + c_idx) *
+ height * width;
+
+ // extreme_idx in [0,1] -> offset_box_x indexed at x1
+ // extreme_idx in [2,3] -> offset_box_x indexed at x2
+ offset_box_x = offset_box + extreme_idx / 2 * 2;
+
+ switch (extreme_idx) {
+ // top
+ case BorderMode::Top:
+ stride = box_width / pool_size;
+ x_stride = stride;
+ y_stride = 0;
+ break;
+ // left
+ case BorderMode::Left:
+ stride = box_height / pool_size;
+ x_stride = 0;
+ y_stride = stride;
+ break;
+ // bottom
+ case BorderMode::Bottom:
+ stride = box_width / pool_size;
+ x_stride = -stride;
+ y_stride = 0;
+ break;
+ // right
+ case BorderMode::Right:
+ stride = box_height / pool_size;
+ x_stride = 0;
+ y_stride = -stride;
+ break;
+ }
+
+ // get position (x,y) which has maximum value during forward
+ x = *offset_box_x;
+ y = *(offset_box_x + 1);
+ x += x_stride * (T)(*offset_argmax_idx);
+ y += y_stride * (T)(*offset_argmax_idx);
+
+ T w1, w2, w3, w4;
+ int x_low, x_high, y_low, y_high;
+ bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, w4, x_low,
+ x_high, y_low, y_high, index);
+
+ // update grad_output
+ atomicAdd(offset_grad_input + y_low * width + x_low,
+ *offset_grad_output * w1);
+ atomicAdd(offset_grad_input + y_low * width + x_high,
+ *offset_grad_output * w2);
+ atomicAdd(offset_grad_input + y_high * width + x_low,
+ *offset_grad_output * w3);
+ atomicAdd(offset_grad_input + y_high * width + x_high,
+ *offset_grad_output * w4);
+ }
+}
+
+#endif // BORDER_ALIGN_CUDA_KERNEL_CUH
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/csrc/common/cuda/box_iou_rotated_cuda.cuh b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/csrc/common/cuda/box_iou_rotated_cuda.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..abd47cd85437804310886de057b5a839a49481b2
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/csrc/common/cuda/box_iou_rotated_cuda.cuh
@@ -0,0 +1,81 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+// modified from
+// https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_cuda.cu
+#ifndef BOX_IOU_ROTATED_CUDA_CUH
+#define BOX_IOU_ROTATED_CUDA_CUH
+
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else
+#include "pytorch_cuda_helper.hpp"
+#endif
+#include "box_iou_rotated_utils.hpp"
+
+// 2D block with 32 * 16 = 512 threads per block
+const int BLOCK_DIM_X = 32;
+const int BLOCK_DIM_Y = 16;
+
+inline int divideUP(const int x, const int y) { return (((x) + (y)-1) / (y)); }
+
+template <typename T>
+__global__ void box_iou_rotated_cuda_kernel(
+ const int n_boxes1, const int n_boxes2, const T* dev_boxes1,
+ const T* dev_boxes2, T* dev_ious, const int mode_flag, const bool aligned) {
+ if (aligned) {
+ CUDA_1D_KERNEL_LOOP(index, n_boxes1) {
+ int b1 = index;
+ int b2 = index;
+
+ int base1 = b1 * 5;
+
+ float block_boxes1[5];
+ float block_boxes2[5];
+
+ block_boxes1[0] = dev_boxes1[base1 + 0];
+ block_boxes1[1] = dev_boxes1[base1 + 1];
+ block_boxes1[2] = dev_boxes1[base1 + 2];
+ block_boxes1[3] = dev_boxes1[base1 + 3];
+ block_boxes1[4] = dev_boxes1[base1 + 4];
+
+ int base2 = b2 * 5;
+
+ block_boxes2[0] = dev_boxes2[base2 + 0];
+ block_boxes2[1] = dev_boxes2[base2 + 1];
+ block_boxes2[2] = dev_boxes2[base2 + 2];
+ block_boxes2[3] = dev_boxes2[base2 + 3];
+ block_boxes2[4] = dev_boxes2[base2 + 4];
+
+ dev_ious[index] =
+ single_box_iou_rotated(block_boxes1, block_boxes2, mode_flag);
+ }
+ } else {
+ CUDA_1D_KERNEL_LOOP(index, n_boxes1 * n_boxes2) {
+ int b1 = index / n_boxes2;
+ int b2 = index % n_boxes2;
+
+ int base1 = b1 * 5;
+
+ float block_boxes1[5];
+ float block_boxes2[5];
+
+ block_boxes1[0] = dev_boxes1[base1 + 0];
+ block_boxes1[1] = dev_boxes1[base1 + 1];
+ block_boxes1[2] = dev_boxes1[base1 + 2];
+ block_boxes1[3] = dev_boxes1[base1 + 3];
+ block_boxes1[4] = dev_boxes1[base1 + 4];
+
+ int base2 = b2 * 5;
+
+ block_boxes2[0] = dev_boxes2[base2 + 0];
+ block_boxes2[1] = dev_boxes2[base2 + 1];
+ block_boxes2[2] = dev_boxes2[base2 + 2];
+ block_boxes2[3] = dev_boxes2[base2 + 3];
+ block_boxes2[4] = dev_boxes2[base2 + 4];
+
+ dev_ious[index] =
+ single_box_iou_rotated(block_boxes1, block_boxes2, mode_flag);
+ }
+ }
+}
+
+#endif
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/csrc/common/cuda/carafe_cuda_kernel.cuh b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/csrc/common/cuda/carafe_cuda_kernel.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..bcfceb0d68cf7c99e2e7a845b18f095fe37d8271
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/csrc/common/cuda/carafe_cuda_kernel.cuh
@@ -0,0 +1,345 @@
+// encoding=utf-8
+// Copyright 2021 Huawei Technologies Co., Ltd
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#ifndef CARAFE_CUDA_KERNEL_CUH
+#define CARAFE_CUDA_KERNEL_CUH
+
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else
+#include "pytorch_cuda_helper.hpp"
+#endif
+
+#ifdef HIP_DIFF
+#define WARP_SIZE 64
+#else
+#define WARP_SIZE 32
+#endif
+#define THREADS_PER_PIXEL 32
+#define MAX_SHARED_MEMORY 49152
+#define MAX_SHARED_SCALAR_T 6144 // 49152 / 8 = 6144
+#define MAXIMIZE_KERNEL_SIZE true
+#define kTileDim 32
+#define kBlockRows 8
+#define FULL_MASK 0xffffffff
+
+inline int divideUP(const int x, const int y) { return (((x) + (y)-1) / (y)); }
+
+__device__ inline int Loc2Index(const int n, const int c, const int h,
+ const int w, const int channel_num,
+ const int height, const int width) {
+ int index = w + (h + (c + n * channel_num) * height) * width;
+ return index;
+}
+#ifndef HIP_DIFF
+/* TODO: move this to a common place */
+template <typename scalar_t>
+__device__ inline scalar_t min(scalar_t a, scalar_t b) {
+ return a < b ? a : b;
+}
+
+template <typename scalar_t>
+__device__ inline scalar_t max(scalar_t a, scalar_t b) {
+ return a > b ? a : b;
+}
+#endif
+template <typename scalar_t>
+__device__ __forceinline__ scalar_t warpReduceSum(scalar_t val) {
+ for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2)
+#ifdef HIP_DIFF
+ val += __shfl_down(val, offset);
+#else
+ val += __shfl_down_sync(FULL_MASK, val, offset);
+#endif
+ return val;
+}
+
+template <>
+__device__ __forceinline__ phalf warpReduceSum(phalf val) {
+ for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2)
+#ifdef HIP_DIFF
+ __PHALF(val) += __shfl_down(FULL_MASK, val, offset);
+#else
+ __PHALF(val) +=
+ __shfl_down_sync(FULL_MASK, static_cast<__half>(__PHALF(val)), offset);
+#endif
+ return val;
+}
+
+// Splits the original matrix into submatrices with size 32 * 32.
+// Each block transposes one submatrix by loading it into shared memory.
+// Reference https://devblogs.nvidia.com/efficient-matrix-transpose-cuda-cc/
+template <typename scalar_t>
+__global__ void BatchTranspose2DCUDAKernel(const int N, const int H,
+ const int W, const int dh,
+ const int dw,
+ const scalar_t *__restrict__ X,
+ scalar_t *__restrict__ Y) {
+ __shared__ scalar_t tile[kTileDim][kTileDim + 1];
+ const int n = blockIdx.x / (dh * dw);
+ const int k = blockIdx.x % (dh * dw);
+ const int r = k / dw;
+ const int c = k % dw;
+ const int offset = n * H * W;
+ int x = c * kTileDim + threadIdx.x;
+ int y = r * kTileDim + threadIdx.y;
+ if (x < W) {
+ for (int i = 0; threadIdx.y + i < kTileDim && y + i < H; i += kBlockRows) {
+ tile[threadIdx.y + i][threadIdx.x] = X[offset + (y + i) * W + x];
+ }
+ }
+ __syncthreads();
+ x = r * kTileDim + threadIdx.x;
+ y = c * kTileDim + threadIdx.y;
+ if (x < H) {
+ for (int i = 0; threadIdx.y + i < kTileDim && y + i < W; i += kBlockRows) {
+ Y[offset + (y + i) * H + x] = tile[threadIdx.x][threadIdx.y + i];
+ }
+ }
+}
+template <typename scalar_t>
+__global__ void CARAFEForward(
+ const int num_kernels, const scalar_t *__restrict__ bottom_data,
+ const scalar_t *__restrict__ bottom_masks, const int kernel_size,
+ const int group_size, const int scale_factor, const int channels,
+ const int down_height, const int down_width, const int height,
+ const int width, const int mask_channels, scalar_t *__restrict__ top_data) {
+#if MAXIMIZE_KERNEL_SIZE
+ __shared__ float shared_mask[MAX_SHARED_SCALAR_T * 2];
+#else
+ __shared__ scalar_t shared_mask[MAX_SHARED_SCALAR_T];
+#endif
+
+ int index = threadIdx.x + blockIdx.x * blockDim.x;
+ if (index > num_kernels - 1) {
+ return;
+ }
+ const int pixel_id = threadIdx.x / THREADS_PER_PIXEL;
+ const int split_id = threadIdx.x % THREADS_PER_PIXEL;
+ index = index / THREADS_PER_PIXEL;
+ const int pw = index % width;
+ const int ph = (index / width) % height;
+ const int n = index / width / height;
+
+ const int down_pw = pw / scale_factor;
+ const int down_ph = ph / scale_factor;
+
+ const int start_w = down_pw - (kernel_size - 1) / 2;
+ const int end_w = down_pw + (kernel_size - 1) / 2 + 1;
+ const int start_h = down_ph - (kernel_size - 1) / 2;
+ const int end_h = down_ph + (kernel_size - 1) / 2 + 1;
+ for (int c = split_id; c < mask_channels; c += THREADS_PER_PIXEL) {
+ int mask_index = Loc2Index(n, ph, pw, c, height, width, mask_channels);
+ shared_mask[c * WARP_SIZE + pixel_id] = bottom_masks[mask_index];
+ }
+ __syncthreads();
+
+ const int channels_per_group = ceilf(channels / (float)group_size);
+#pragma unroll
+ for (int c = split_id; c < channels; c += THREADS_PER_PIXEL) {
+ int mask_group = c / channels_per_group;
+ scalar_t output_val = 0;
+#pragma unroll
+ for (int iy = start_h; iy < end_h; iy++) {
+#pragma unroll
+ for (int ix = start_w; ix < end_w; ix++) {
+ if (iy < 0 || iy > down_height - 1 || ix < 0 || ix > down_width - 1) {
+ continue;
+ }
+ int mask_iy = iy - down_ph + (kernel_size - 1) / 2;
+ int mask_ix = ix - down_pw + (kernel_size - 1) / 2;
+ int mask_c =
+ (mask_group * kernel_size + mask_iy) * kernel_size + mask_ix;
+ int feat_index =
+ Loc2Index(n, iy, ix, c, down_height, down_width, channels);
+
+ output_val += bottom_data[feat_index] *
+ shared_mask[mask_c * WARP_SIZE + pixel_id];
+ }
+ }
+
+ int top_index = Loc2Index(n, ph, pw, c, height, width, channels);
+ top_data[top_index] = output_val;
+ }
+}
+
+template <typename scalar_t>
+__global__ void CARAFEBackward_Feature(
+ const int num_kernels, const scalar_t *__restrict__ top_diff,
+ const scalar_t *__restrict__ bottom_masks, const int kernel_size,
+ const int group_size, const int scale_factor, const int channels,
+ const int down_height, const int down_width, const int height,
+ const int width, const int mask_channels,
+ scalar_t *__restrict__ bottom_diff) {
+#if MAXIMIZE_KERNEL_SIZE
+ __shared__ float shared_mask[MAX_SHARED_SCALAR_T * 2];
+#else
+ __shared__ scalar_t shared_mask[MAX_SHARED_SCALAR_T];
+#endif
+
+ int index = threadIdx.x + blockIdx.x * blockDim.x;
+ if (index > num_kernels - 1) {
+ return;
+ }
+
+ const int pixel_id = threadIdx.x / THREADS_PER_PIXEL;
+ const int split_id = threadIdx.x % THREADS_PER_PIXEL;
+ // (n, c, ph, pw) is an element in the bottom_data
+ index = index / THREADS_PER_PIXEL;
+ const int pw = index % width;
+ const int ph = (index / width) % height;
+ const int n = index / width / height;
+
+ const int start_w = pw - (kernel_size - 1) * scale_factor / 2;
+ const int end_w = pw + (kernel_size - 1) * scale_factor / 2 + 1;
+ const int start_h = ph - (kernel_size - 1) * scale_factor / 2;
+ const int end_h = ph + (kernel_size - 1) * scale_factor / 2 + 1;
+ for (int c = split_id; c < mask_channels; c += THREADS_PER_PIXEL) {
+ const int mask_w = (c % kernel_size) * scale_factor;
+ const int mask_h = (c / kernel_size % kernel_size) * scale_factor;
+ const int mask_x = start_w + mask_w;
+ const int mask_y = start_h + mask_h;
+ if (mask_y < 0 || mask_y > height - 1 || mask_x < 0 || mask_x > width - 1) {
+ shared_mask[c * WARP_SIZE + pixel_id] = 0;
+ continue;
+ }
+ const int mask_group = c / (kernel_size * kernel_size);
+ const int mask_c = (2 * mask_group + 1) * kernel_size * kernel_size - c - 1;
+ int mask_index =
+ Loc2Index(n, mask_c, mask_y, mask_x, mask_channels, height, width);
+ shared_mask[c * WARP_SIZE + pixel_id] = bottom_masks[mask_index];
+ }
+ __syncthreads();
+ const int channels_per_group = ceilf(channels / (float)group_size);
+#pragma unroll
+ for (int c = split_id; c < channels; c += THREADS_PER_PIXEL) {
+ int mask_group = c / channels_per_group;
+ int top_index = Loc2Index(n, ph, pw, c, height, width, channels);
+ scalar_t output_val = 0;
+#pragma unroll
+ for (int iy = start_h; iy < end_h; iy += scale_factor) {
+#pragma unroll
+ for (int ix = start_w; ix < end_w; ix += scale_factor) {
+ if (iy < 0 || iy > height - 1 || ix < 0 || ix > width - 1) {
+ continue;
+ }
+ int mask_iy =
+ (iy - ph + (kernel_size - 1) * scale_factor / 2) / scale_factor;
+ int mask_ix =
+ (ix - pw + (kernel_size - 1) * scale_factor / 2) / scale_factor;
+ int mask_c =
+ (mask_group * kernel_size + mask_iy) * kernel_size + mask_ix;
+ int feat_index = Loc2Index(n, iy, ix, c, height, width, channels);
+ output_val +=
+ shared_mask[mask_c * WARP_SIZE + pixel_id] * top_diff[feat_index];
+ }
+ }
+ bottom_diff[top_index] = output_val;
+ }
+}
+
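+// FeatureSum: sums the high-resolution input over each scale_factor x scale_factor
+// window, producing one (height, width) value per channel and output location.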
+template <typename scalar_t>
+__global__ void FeatureSum(const int num_kernels,
+ const scalar_t *__restrict__ input_data,
+ const int scale_factor, const int channels,
+ const int height, const int width,
+ scalar_t *__restrict__ output_data) {
+ int index = threadIdx.x + blockIdx.x * blockDim.x;
+ if (index > num_kernels - 1) {
+ return;
+ }
+ const int split_id = threadIdx.x % THREADS_PER_PIXEL;
+ index = index / THREADS_PER_PIXEL;
+ const int pw = index % width;
+ const int ph = (index / width) % height;
+ const int n = index / width / height;
+ for (int c = split_id; c < channels; c += THREADS_PER_PIXEL) {
+ scalar_t output_val = 0;
+ for (int iy = ph * scale_factor; iy < (ph + 1) * scale_factor; iy++) {
+ for (int ix = pw * scale_factor; ix < (pw + 1) * scale_factor; ix++) {
+ int input_id = Loc2Index(n, iy, ix, c, height * scale_factor,
+ width * scale_factor, channels);
+ output_val += input_data[input_id];
+ }
+ }
+ const int output_id = Loc2Index(n, ph, pw, c, height, width, channels);
+ output_data[output_id] = output_val;
+ }
+}
+
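+// CARAFEBackward_Mask: gradient w.r.t. the reassembly masks; each warp accumulates
+// top_diff * bottom_data over its channel group and lane 0 writes the warp-reduced sum.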
+template <typename scalar_t>
+__global__ void CARAFEBackward_Mask(const int num_kernels,
+ const scalar_t *__restrict__ top_diff,
+ const scalar_t *__restrict__ bottom_data,
+ const int kernel_size, const int group_size,
+ const int scale_factor, const int channels,
+ const int down_height, const int down_width,
+ const int height, const int width,
+ const int mask_channels,
+ scalar_t *__restrict__ mask_diff) {
+ int index = threadIdx.x + blockIdx.x * blockDim.x;
+ if (index > num_kernels - 1) {
+ return;
+ }
+
+ const int lane_id = index % WARP_SIZE;
+ index = index / WARP_SIZE;
+ const int mask_c = index % mask_channels;
+ // (n, c, ph, pw) is an element in the bottom_data
+ index = index / mask_channels;
+ const int pw = index % width;
+ const int ph = (index / width) % height;
+ const int n = index / width / height;
+
+ const int down_pw = pw / scale_factor;
+ const int down_ph = ph / scale_factor;
+
+ const int mask_group = mask_c / (kernel_size * kernel_size);
+ const int mask_loc = mask_c % (kernel_size * kernel_size);
+
+ const int offset_x = mask_loc % kernel_size - (kernel_size - 1) / 2;
+ const int offset_y =
+ mask_loc / kernel_size % kernel_size - (kernel_size - 1) / 2;
+
+ const int down_x = down_pw + offset_x;
+ const int down_y = down_ph + offset_y;
+
+ scalar_t output_val = 0;
+
+ if (down_y >= 0 && down_y <= down_height - 1 && down_x >= 0 &&
+ down_x <= down_width - 1) {
+ const int channels_per_mask = ceilf(channels / (float)group_size);
+ const int start = channels_per_mask * mask_group;
+ const int end = min(channels_per_mask * (mask_group + 1), channels);
+ for (int c = start + lane_id; c < end; c += WARP_SIZE) {
+ int bottom_id =
+ Loc2Index(n, down_y, down_x, c, down_height, down_width, channels);
+ int top_id = Loc2Index(n, ph, pw, c, height, width, channels);
+ output_val += top_diff[top_id] * bottom_data[bottom_id];
+ }
+ }
+#ifdef HIP_DIFF
+ __syncthreads();
+#else
+ __syncwarp();
+#endif
+ output_val = warpReduceSum(output_val);
+ if (lane_id == 0) {
+ const int mask_id =
+ Loc2Index(n, ph, pw, mask_c, height, width, mask_channels);
+ mask_diff[mask_id] = output_val;
+ }
+}
+
+#endif // CARAFE_CUDA_KERNEL_CUH
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/csrc/common/cuda/carafe_naive_cuda_kernel.cuh b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/csrc/common/cuda/carafe_naive_cuda_kernel.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..1e3b3876083d81130d837cfba6209e1b97c4b64c
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/csrc/common/cuda/carafe_naive_cuda_kernel.cuh
@@ -0,0 +1,124 @@
+// encoding=utf-8
+// Copyright 2021 Huawei Technologies Co., Ltd
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#ifndef CARAFE_NAIVE_CUDA_KERNEL_CUH
+#define CARAFE_NAIVE_CUDA_KERNEL_CUH
+
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else
+#include "pytorch_cuda_helper.hpp"
+#endif
+
+__device__ inline int Loc2Index(const int n, const int c, const int h,
+ const int w, const int channel_num,
+ const int height, const int width) {
+ int index = w + (h + (c + n * channel_num) * height) * width;
+ return index;
+}
+
+template <typename scalar_t>
+__global__ void carafe_naive_forward_cuda_kernel(
+ const int nthreads, const scalar_t *bottom_data,
+ const scalar_t *bottom_masks, scalar_t *top_data, const int kernel_size,
+ const int group_size, const int scale_factor, const int channels,
+ const int height, const int width) {
+ CUDA_1D_KERNEL_LOOP(index, nthreads) {
+ // (n, c, ph, pw) is an element in the bottom_data
+ int pw = index % width;
+ int ph = (index / width) % height;
+ int c = (index / width / height) % channels;
+ int n = index / width / height / channels;
+
+ int mask_channels = kernel_size * kernel_size * group_size;
+ int mask_group = c / (channels / group_size);
+
+ int down_pw = pw / scale_factor;
+ int down_ph = ph / scale_factor;
+ int down_width = width / scale_factor;
+ int down_height = height / scale_factor;
+ int start_w = down_pw - (kernel_size - 1) / 2;
+ int end_w = down_pw + (kernel_size - 1) / 2 + 1;
+ int start_h = down_ph - (kernel_size - 1) / 2;
+ int end_h = down_ph + (kernel_size - 1) / 2 + 1;
+
+ scalar_t output_val = 0;
+ for (int iy = start_h; iy < end_h; iy++) {
+ for (int ix = start_w; ix < end_w; ix++) {
+ if (iy < 0 || iy > down_height - 1 || ix < 0 || ix > down_width - 1) {
+ continue;
+ }
+ int mask_iy = iy - down_ph + (kernel_size - 1) / 2;
+ int mask_ix = ix - down_pw + (kernel_size - 1) / 2;
+ int mask_c =
+ (mask_group * kernel_size + mask_iy) * kernel_size + mask_ix;
+ int feat_index =
+ Loc2Index(n, c, iy, ix, channels, down_height, down_width);
+ int mask_index =
+ Loc2Index(n, mask_c, ph, pw, mask_channels, height, width);
+ output_val += bottom_data[feat_index] * bottom_masks[mask_index];
+ }
+ }
+ top_data[index] = output_val;
+ }
+}
+
+template <typename scalar_t>
+__global__ void carafe_naive_backward_cuda_kernel(
+ const int nthreads, const scalar_t *top_diff, const scalar_t *bottom_data,
+ const scalar_t *bottom_masks, scalar_t *bottom_diff, scalar_t *mask_diff,
+ const int kernel_size, const int group_size, const int scale_factor,
+ const int channels, const int height, const int width) {
+ CUDA_1D_KERNEL_LOOP(index, nthreads) {
+ // (n, c, ph, pw) is an element in the bottom_data
+ int pw = index % width;
+ int ph = (index / width) % height;
+ int c = (index / width / height) % channels;
+ int n = index / width / height / channels;
+
+ int mask_channels = kernel_size * kernel_size * group_size;
+ int mask_group = c / (channels / group_size);
+
+ int down_pw = pw / scale_factor;
+ int down_ph = ph / scale_factor;
+ int down_width = width / scale_factor;
+ int down_height = height / scale_factor;
+ int start_w = down_pw - (kernel_size - 1) / 2;
+ int end_w = down_pw + (kernel_size - 1) / 2 + 1;
+ int start_h = down_ph - (kernel_size - 1) / 2;
+ int end_h = down_ph + (kernel_size - 1) / 2 + 1;
+
+ for (int iy = start_h; iy < end_h; iy++) {
+ for (int ix = start_w; ix < end_w; ix++) {
+ if (iy < 0 || iy > down_height - 1 || ix < 0 || ix > down_width - 1) {
+ continue;
+ }
+ int mask_iy = iy - down_ph + (kernel_size - 1) / 2;
+ int mask_ix = ix - down_pw + (kernel_size - 1) / 2;
+ int mask_c =
+ (mask_group * kernel_size + mask_iy) * kernel_size + mask_ix;
+ int feat_index =
+ Loc2Index(n, c, iy, ix, channels, down_height, down_width);
+ int mask_index =
+ Loc2Index(n, mask_c, ph, pw, mask_channels, height, width);
+ atomicAdd(bottom_diff + feat_index,
+ bottom_masks[mask_index] * top_diff[index]);
+ atomicAdd(mask_diff + mask_index,
+ bottom_data[feat_index] * top_diff[index]);
+ }
+ }
+ }
+}
+
+#endif // CARAFE_NAIVE_CUDA_KERNEL_CUH
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/csrc/common/cuda/common_cuda_helper.hpp b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/csrc/common/cuda/common_cuda_helper.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..b12aa9a26a2cc162fd89f68ccc97e17749090a41
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/csrc/common/cuda/common_cuda_helper.hpp
@@ -0,0 +1,120 @@
+#ifndef COMMON_CUDA_HELPER
+#define COMMON_CUDA_HELPER
+
+#include <cuda.h>
+
+#define CUDA_1D_KERNEL_LOOP(i, n) \
+ for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
+ i += blockDim.x * gridDim.x)
+
+#define CUDA_2D_KERNEL_LOOP(i, n, j, m) \
+ for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
+ i += blockDim.x * gridDim.x) \
+ for (size_t j = blockIdx.y * blockDim.y + threadIdx.y; j < (m); \
+ j += blockDim.y * gridDim.y)
+
+#define CUDA_2D_KERNEL_BLOCK_LOOP(i, n, j, m) \
+ for (size_t i = blockIdx.x; i < (n); i += gridDim.x) \
+ for (size_t j = blockIdx.y; j < (m); j += gridDim.y)
+
+#define THREADS_PER_BLOCK 512
+
+inline int GET_BLOCKS(const int N, const int num_threads = THREADS_PER_BLOCK) {
+ int optimal_block_num = (N + num_threads - 1) / num_threads;
+ int max_block_num = 4096;
+ return min(optimal_block_num, max_block_num);
+}
+
+template <typename T>
+__device__ T bilinear_interpolate(const T* input, const int height,
+ const int width, T y, T x,
+ const int index /* index for debug only*/) {
+ // deal with cases that inverse elements are out of feature map boundary
+ if (y < -1.0 || y > height || x < -1.0 || x > width) return 0;
+
+ if (y <= 0) y = 0;
+ if (x <= 0) x = 0;
+
+ int y_low = (int)y;
+ int x_low = (int)x;
+ int y_high;
+ int x_high;
+
+ if (y_low >= height - 1) {
+ y_high = y_low = height - 1;
+ y = (T)y_low;
+ } else {
+ y_high = y_low + 1;
+ }
+
+ if (x_low >= width - 1) {
+ x_high = x_low = width - 1;
+ x = (T)x_low;
+ } else {
+ x_high = x_low + 1;
+ }
+
+ T ly = y - y_low;
+ T lx = x - x_low;
+ T hy = 1. - ly, hx = 1. - lx;
+ // do bilinear interpolation
+ T v1 = input[y_low * width + x_low];
+ T v2 = input[y_low * width + x_high];
+ T v3 = input[y_high * width + x_low];
+ T v4 = input[y_high * width + x_high];
+ T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
+
+ T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
+
+ return val;
+}
+
+template <typename T>
+__device__ void bilinear_interpolate_gradient(
+ const int height, const int width, T y, T x, T& w1, T& w2, T& w3, T& w4,
+ int& x_low, int& x_high, int& y_low, int& y_high,
+ const int index /* index for debug only*/) {
+ // deal with cases that inverse elements are out of feature map boundary
+ if (y < -1.0 || y > height || x < -1.0 || x > width) {
+ // empty
+ w1 = w2 = w3 = w4 = 0.;
+ x_low = x_high = y_low = y_high = -1;
+ return;
+ }
+
+ if (y <= 0) y = 0;
+ if (x <= 0) x = 0;
+
+ y_low = (int)y;
+ x_low = (int)x;
+
+ if (y_low >= height - 1) {
+ y_high = y_low = height - 1;
+ y = (T)y_low;
+ } else {
+ y_high = y_low + 1;
+ }
+
+ if (x_low >= width - 1) {
+ x_high = x_low = width - 1;
+ x = (T)x_low;
+ } else {
+ x_high = x_low + 1;
+ }
+
+ T ly = y - y_low;
+ T lx = x - x_low;
+ T hy = 1. - ly, hx = 1. - lx;
+
+ // reference in forward
+ // T v1 = input[y_low * width + x_low];
+ // T v2 = input[y_low * width + x_high];
+ // T v3 = input[y_high * width + x_low];
+ // T v4 = input[y_high * width + x_high];
+ // T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
+
+ w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
+
+ return;
+}
+#endif // COMMON_CUDA_HELPER
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/csrc/common/cuda/convex_iou_cuda_kernel.cuh b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/csrc/common/cuda/convex_iou_cuda_kernel.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..98ce7cdb384810188cb6552de27cc237fd865b7e
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/csrc/common/cuda/convex_iou_cuda_kernel.cuh
@@ -0,0 +1,844 @@
+// encoding=utf-8
+// Copyright 2021 Huawei Technologies Co., Ltd
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#ifndef CONVEX_IOU_CUDA_KERNEL_CUH
+#define CONVEX_IOU_CUDA_KERNEL_CUH
+
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else
+#include "pytorch_cuda_helper.hpp"
+#endif
+
+#define MAXN 100
+#define NMAX 512
+__device__ const double EPS = 1E-8;
+
+__device__ inline int sig(double d) { return (d > EPS) - (d < -EPS); }
+
+struct Point {
+ double x, y;
+ __device__ Point() {}
+ __device__ Point(double x, double y) : x(x), y(y) {}
+};
+
+__device__ inline bool point_same(Point& a, Point& b) {
+ return sig(a.x - b.x) == 0 && sig(a.y - b.y) == 0;
+}
+
+__device__ inline void swap1(Point* a, Point* b) {
+ Point temp;
+ temp.x = a->x;
+ temp.y = a->y;
+
+ a->x = b->x;
+ a->y = b->y;
+
+ b->x = temp.x;
+ b->y = temp.y;
+}
+
+__device__ inline void reverse1(Point* a, const int n) {
+ for (int i = 0; i < (n - 1) / 2.0; i++) {
+ Point* j = &(a[i]);
+ Point* k = &(a[n - 1 - i]);
+ swap1(j, k);
+ }
+}
+
+__device__ inline double cross(Point o, Point a, Point b) {
+ return (a.x - o.x) * (b.y - o.y) - (b.x - o.x) * (a.y - o.y);
+}
+
+__device__ inline double dis(Point a, Point b) {
+ return (a.x - b.x) * (a.x - b.x) + (a.y - b.y) * (a.y - b.y);
+}
+__device__ inline double area(Point* ps, int n) {
+ ps[n] = ps[0];
+ double res = 0;
+ for (int i = 0; i < n; i++) {
+ res += ps[i].x * ps[i + 1].y - ps[i].y * ps[i + 1].x;
+ }
+ return res / 2.0;
+}
+__device__ inline double polygon_area_grad(Point* ps, int n,
+ int* polygon_to_pred_index,
+ int n_pred, double* grad_C) {
+ ps[n] = ps[0];
+ double partion_grad[4 * 30 + 2];
+ double res = 0;
+ for (int i = 0; i < n; i++) {
+ res += ps[i].x * ps[i + 1].y - ps[i].y * ps[i + 1].x;
+ partion_grad[i * 4 + 2] = ps[i + 1].y;
+ partion_grad[i * 4 + 3] = -ps[i + 1].x;
+ if (i != n - 1) {
+ partion_grad[i * 4 + 4] = -ps[i].y;
+ partion_grad[i * 4 + 5] = ps[i].x;
+ } else {
+ partion_grad[0] = -ps[i].y;
+ partion_grad[1] = ps[i].x;
+ }
+ }
+ for (int i = 0; i < n; i++) {
+ for (int j = 0; j < n_pred; j++) {
+ if (i == polygon_to_pred_index[j]) {
+ grad_C[2 * polygon_to_pred_index[j + n_pred]] =
+ (partion_grad[i * 4] + partion_grad[i * 4 + 2]) / 2;
+ break;
+ }
+ }
+ for (int j = 0; j < n_pred; j++) {
+ if (i == polygon_to_pred_index[j]) {
+ grad_C[2 * polygon_to_pred_index[j + n_pred] + 1] =
+ (partion_grad[i * 4 + 1] + partion_grad[i * 4 + 1 + 2]) / 2;
+ break;
+ }
+ }
+ }
+
+ return res / 2.0;
+}
+
+__device__ inline int lineCross(Point a, Point b, Point c, Point d, Point& p,
+ double* cut_grad, int m, int n, int i) {
+ double s1, s2;
+ double s2_s1_2;
+ double ds1_dxc, ds1_dyc, ds2_dxd, ds2_dyd;
+ double dxp_dxc, dxp_dyc, dxp_dxd, dxp_dyd, dyp_dxc, dyp_dyc, dyp_dxd, dyp_dyd;
+ s1 = cross(a, b, c);
+ s2 = cross(a, b, d);
+
+ ds1_dxc = -(b.y - a.y);
+ ds1_dyc = b.x - a.x;
+ ds2_dxd = ds1_dxc;
+ ds2_dyd = ds1_dyc;
+ s2_s1_2 = (s2 - s1) * (s2 - s1);
+
+ if (sig(s1) == 0 && sig(s2) == 0) return 2;
+ if (sig(s2 - s1) == 0) return 0;
+
+ dxp_dxc =
+ ((s2 - d.x * ds1_dxc) * (s2 - s1) - (c.x * s2 - d.x * s1) * (-ds1_dxc)) /
+ (s2_s1_2);
+ dxp_dyc =
+ ((0 - d.x * ds1_dyc) * (s2 - s1) - (c.x * s2 - d.x * s1) * (-ds1_dyc)) /
+ (s2_s1_2);
+ dxp_dxd =
+ ((c.x * ds2_dxd - s1) * (s2 - s1) - (c.x * s2 - d.x * s1) * (ds2_dxd)) /
+ (s2_s1_2);
+ dxp_dyd =
+ ((c.x * ds2_dyd - 0) * (s2 - s1) - (c.x * s2 - d.x * s1) * (ds2_dyd)) /
+ (s2_s1_2);
+
+ dyp_dxc =
+ ((0 - d.y * ds1_dxc) * (s2 - s1) - (c.y * s2 - d.y * s1) * (-ds1_dxc)) /
+ (s2_s1_2);
+ dyp_dyc =
+ ((s2 - d.y * ds1_dyc) * (s2 - s1) - (c.y * s2 - d.y * s1) * (-ds1_dyc)) /
+ (s2_s1_2);
+ dyp_dxd =
+ ((c.y * ds2_dxd - 0) * (s2 - s1) - (c.y * s2 - d.y * s1) * (ds2_dxd)) /
+ (s2_s1_2);
+ dyp_dyd =
+ ((c.y * ds2_dyd - s1) * (s2 - s1) - (c.y * s2 - d.y * s1) * (ds2_dyd)) /
+ (s2_s1_2);
+
+ p.x = (c.x * s2 - d.x * s1) / (s2 - s1);
+ p.y = (c.y * s2 - d.y * s1) / (s2 - s1);
+ if (i == n - 1) {
+ cut_grad[4 * n * m + 4 * i] = dxp_dxc; // + dyp_dxc;
+ cut_grad[4 * n * m + 4 * i + 1] = dyp_dxc;
+ cut_grad[4 * n * m + 4 * i + 2] = dxp_dyc; // + dyp_dyc;
+ cut_grad[4 * n * m + 4 * i + 3] = dyp_dyc;
+ cut_grad[4 * n * m + 0] = dxp_dxd; // + dyp_dxd;
+ cut_grad[4 * n * m + 1] = dyp_dxd;
+ cut_grad[4 * n * m + 2] = dxp_dyd; // + dyp_dyd;
+ cut_grad[4 * n * m + 3] = dyp_dyd;
+ } else {
+ cut_grad[4 * n * m + 4 * i] = dxp_dxc; // + dyp_dxc;
+ cut_grad[4 * n * m + 4 * i + 1] = dyp_dxc;
+ cut_grad[4 * n * m + 4 * i + 2] = dxp_dyc; // + dyp_dyc;
+ cut_grad[4 * n * m + 4 * i + 3] = dyp_dyc;
+ cut_grad[4 * n * m + 4 * (i + 1)] = dxp_dxd; // + dyp_dxd;
+ cut_grad[4 * n * m + 4 * (i + 1) + 1] = dyp_dxd;
+ cut_grad[4 * n * m + 4 * (i + 1) + 2] = dxp_dyd; // + dyp_dyd;
+ cut_grad[4 * n * m + 4 * (i + 1) + 3] = dyp_dyd;
+ }
+
+ return 1;
+}
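+// Clips polygon p against the half-plane to the left of directed line a->b (one
+// Sutherland-Hodgman step) and records the Jacobian of the surviving vertices in cut_grad.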
+__device__ inline void polygon_cut(Point* p, int& n, Point a, Point b,
+ double* cut_grad) {
+ Point pp[MAXN];
+ double ccur_grad[MAXN] = {};
+ int m = 0;
+ p[n] = p[0];
+ int k = n;
+ for (int i = 0; i < n; i++) {
+ if (sig(cross(a, b, p[i])) > 0) {
+ pp[m] = p[i];
+ ccur_grad[4 * n * m + 4 * i] = 1.0;
+ ccur_grad[4 * n * m + 4 * i + 3] = 1.0;
+ m++;
+ }
+ if (sig(cross(a, b, p[i])) != sig(cross(a, b, p[i + 1]))) {
+ lineCross(a, b, p[i], p[i + 1], pp[m], ccur_grad, m, n, i);
+ m++;
+ }
+ }
+
+ n = 0;
+ for (int i = 0; i < m; i++) {
+ if (!i || !(point_same(pp[i], pp[i - 1]))) {
+ p[n] = pp[i];
+ for (int j = 0; j < 4 * k; j++) {
+ cut_grad[4 * k * n + j] = ccur_grad[4 * k * i + j];
+ }
+ n++;
+ }
+ }
+
+ while (n > 1 && point_same(p[n - 1], p[0])) n--;
+}
+
+__device__ inline double intersectArea(Point a, Point b, Point c, Point d,
+ double* grad_AB, int order,
+ int convex_n) {
+ Point o(0, 0);
+ int res_flag = 0;
+ int s1 = sig(cross(o, a, b));
+ int s2 = sig(cross(o, c, d));
+ if (s1 == 0 || s2 == 0) return 0.0;
+ if (s1 == -1) {
+ Point* i = &a;
+ Point* j = &b;
+ swap1(i, j);
+ res_flag = 1;
+ }
+ if (s2 == -1) {
+ Point* i = &c;
+ Point* j = &d;
+ swap1(i, j);
+ }
+ Point p[10] = {o, a, b};
+ int n = 3, n0 = 3, n1, n2, n3;
+ double cut_grad1[MAXN] = {};
+ double cut_grad2[MAXN] = {};
+ double cut_grad3[MAXN] = {};
+ double p1_p_grad[10][10] = {};
+ double p2_p1_grad[10][10] = {};
+ double p3_p2_grad[10][10] = {};
+
+ double p3_p1_grad[10][10] = {};
+ double p3_p_grad[10][10] = {};
+
+ // 1
+ polygon_cut(p, n, o, c, cut_grad1);
+ n1 = n;
+ for (int i = 0; i < n; i++) {
+ for (int j = 0; j < 4 * n0; j++) {
+ if (!(j % 2)) {
+ p1_p_grad[2 * i][j / 2] = cut_grad1[4 * n0 * i + j];
+ } else {
+ p1_p_grad[2 * i + 1][j / 2] = cut_grad1[4 * n0 * i + j];
+ }
+ }
+ }
+
+ // 2
+ polygon_cut(p, n, c, d, cut_grad2);
+ n2 = n;
+ for (int i = 0; i < n; i++) {
+ for (int j = 0; j < 4 * n1; j++) {
+ if (!(j % 2)) {
+ p2_p1_grad[2 * i][j / 2] = cut_grad2[4 * n1 * i + j];
+ } else {
+ p2_p1_grad[2 * i + 1][j / 2] = cut_grad2[4 * n1 * i + j];
+ }
+ }
+ }
+ // 3
+ polygon_cut(p, n, d, o, cut_grad3);
+ n3 = n;
+ for (int i = 0; i < n; i++) {
+ for (int j = 0; j < 4 * n2; j++) {
+ if (!(j % 2)) {
+ p3_p2_grad[2 * i][j / 2] = cut_grad3[4 * n2 * i + j];
+ } else {
+ p3_p2_grad[2 * i + 1][j / 2] = cut_grad3[4 * n2 * i + j];
+ }
+ }
+ }
+
+ // mul
+ // p3_p2(n3 * n2) * p2_p1(n2 * n1) = p3_p1 (n3 * n1)
+ for (int i = 0; i < 2 * n3; i++) {
+ for (int j = 0; j < 2 * n1; j++) {
+ double sum = 0.0;
+ for (int m = 0; m < 2 * n2; m++) {
+ sum = sum + p3_p2_grad[i][m] * p2_p1_grad[m][j];
+ }
+ p3_p1_grad[i][j] = sum;
+ }
+ }
+
+ // p3_p1 (n3 * n1) * p1_p (n1 * n0) = p3_p (n3 * n0)
+ for (int i = 0; i < 2 * n3; i++) {
+ for (int j = 0; j < 2 * n0; j++) {
+ double sum = 0.0;
+ for (int m = 0; m < 2 * n1; m++) {
+ sum = sum + p3_p1_grad[i][m] * p1_p_grad[m][j];
+ }
+ p3_p_grad[i][j] = sum;
+ }
+ }
+
+ // calculate S_grad
+ int polygon_index_box_index[20];
+ double grad_polygon[20];
+ double S_grad[6];
+
+ for (int i = 0; i < n3; i++) {
+ polygon_index_box_index[i] = i;
+ polygon_index_box_index[i + n3] = i;
+ }
+
+ double res =
+ polygon_area_grad(p, n3, polygon_index_box_index, n3, grad_polygon);
+
+ if (s1 * s2 == -1) {
+ for (int j = 0; j < 2 * 3; j++) {
+ double sum = 0.0;
+ for (int m = 0; m < 2 * n3; m++) {
+ sum = sum - grad_polygon[m] * p3_p_grad[m][j];
+ }
+ S_grad[j] = sum;
+ }
+
+ if (order != convex_n - 1) {
+ if (res_flag) {
+ grad_AB[2 * order] += S_grad[4];
+ grad_AB[2 * order + 1] += S_grad[5];
+ grad_AB[2 * order + 2] += S_grad[2];
+ grad_AB[2 * order + 3] += S_grad[3];
+
+ } else {
+ grad_AB[2 * order] += S_grad[2];
+ grad_AB[2 * order + 1] += S_grad[3];
+ grad_AB[2 * order + 2] += S_grad[4];
+ grad_AB[2 * order + 3] += S_grad[5];
+ }
+ } else {
+ if (res_flag) {
+ grad_AB[2 * order] += S_grad[4];
+ grad_AB[2 * order + 1] += S_grad[5];
+ grad_AB[0] += S_grad[2];
+ grad_AB[1] += S_grad[3];
+
+ } else {
+ grad_AB[2 * order] += S_grad[2];
+ grad_AB[2 * order + 1] += S_grad[3];
+ grad_AB[0] += S_grad[4];
+ grad_AB[1] += S_grad[5];
+ }
+ }
+ res = -res;
+ } else {
+ for (int j = 0; j < 2 * 3; j++) {
+ double sum = 0.0;
+ for (int m = 0; m < 2 * n3; m++) {
+ sum = sum + grad_polygon[m] * p3_p_grad[m][j];
+ }
+ S_grad[j] = sum;
+ }
+
+ if (order != convex_n - 1) {
+ if (res_flag) {
+ grad_AB[2 * order] += S_grad[4];
+ grad_AB[2 * order + 1] += S_grad[5];
+ grad_AB[2 * order + 2] += S_grad[2];
+ grad_AB[2 * order + 3] += S_grad[3];
+ } else {
+ grad_AB[2 * order] += S_grad[2];
+ grad_AB[2 * order + 1] += S_grad[3];
+ grad_AB[2 * order + 2] += S_grad[4];
+ grad_AB[2 * order + 3] += S_grad[5];
+ }
+ } else {
+ if (res_flag) {
+ grad_AB[2 * order] += S_grad[4];
+ grad_AB[2 * order + 1] += S_grad[5];
+ grad_AB[0] += S_grad[2];
+ grad_AB[1] += S_grad[3];
+ } else {
+ grad_AB[2 * order] += S_grad[2];
+ grad_AB[2 * order + 1] += S_grad[3];
+ grad_AB[0] += S_grad[4];
+ grad_AB[1] += S_grad[5];
+ }
+ }
+ }
+ return res;
+}
+
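+// Intersection area of two convex polygons: sum of signed triangle-triangle intersections
+// over all edge pairs, with gradients accumulated into grad_AB.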
+__device__ inline double intersectAreaO(Point* ps1, int n1, Point* ps2, int n2,
+ double* grad_AB) {
+ if (area(ps1, n1) < 0) reverse1(ps1, n1);
+ if (area(ps2, n2) < 0) reverse1(ps2, n2);
+ ps1[n1] = ps1[0];
+ ps2[n2] = ps2[0];
+ double res = 0;
+ for (int i = 0; i < n1; i++) {
+ for (int j = 0; j < n2; j++) {
+ res +=
+ intersectArea(ps1[i], ps1[i + 1], ps2[j], ps2[j + 1], grad_AB, i, n1);
+ }
+ }
+ return res;
+}
+
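+// Jarvis march (gift wrapping): rewrites in_poly with its convex-hull vertices and updates n_poly.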
+__device__ inline void Jarvis(Point* in_poly, int& n_poly) {
+ Point p_max, p_k;
+ int max_index, k_index;
+ int Stack[NMAX] = {}, top1, top2;
+ double sign;
+ Point right_point[10], left_point[10];
+
+ for (int i = 0; i < n_poly; i++) {
+ if (in_poly[i].y < in_poly[0].y ||
+ in_poly[i].y == in_poly[0].y && in_poly[i].x < in_poly[0].x) {
+ Point* j = &(in_poly[0]);
+ Point* k = &(in_poly[i]);
+ swap1(j, k);
+ }
+ if (i == 0) {
+ p_max = in_poly[0];
+ max_index = 0;
+ }
+ if (in_poly[i].y > p_max.y ||
+ in_poly[i].y == p_max.y && in_poly[i].x > p_max.x) {
+ p_max = in_poly[i];
+ max_index = i;
+ }
+ }
+
+ if (max_index == 0) {
+ max_index = 1;
+ p_max = in_poly[max_index];
+ }
+
+ k_index = 0, Stack[0] = 0, top1 = 0;
+ while (k_index != max_index) {
+ p_k = p_max;
+ k_index = max_index;
+ for (int i = 1; i < n_poly; i++) {
+ sign = cross(in_poly[Stack[top1]], in_poly[i], p_k);
+ if ((sign > 0) || ((sign == 0) && (dis(in_poly[Stack[top1]], in_poly[i]) >
+ dis(in_poly[Stack[top1]], p_k)))) {
+ p_k = in_poly[i];
+ k_index = i;
+ }
+ }
+ top1++;
+ Stack[top1] = k_index;
+ }
+ for (int i = 0; i <= top1; i++) right_point[i] = in_poly[Stack[i]];
+
+ k_index = 0, Stack[0] = 0, top2 = 0;
+
+ while (k_index != max_index) {
+ p_k = p_max;
+ k_index = max_index;
+ for (int i = 1; i < n_poly; i++) {
+ sign = cross(in_poly[Stack[top2]], in_poly[i], p_k);
+ if ((sign < 0) || (sign == 0) && (dis(in_poly[Stack[top2]], in_poly[i]) >
+ dis(in_poly[Stack[top2]], p_k))) {
+ p_k = in_poly[i];
+ k_index = i;
+ }
+ }
+ top2++;
+ Stack[top2] = k_index;
+ }
+ for (int i = top2 - 1; i >= 0; i--) left_point[i] = in_poly[Stack[i]];
+
+ for (int i = 0; i < top1 + top2; i++) {
+ if (i <= top1) {
+ in_poly[i] = right_point[i];
+ } else {
+ in_poly[i] = left_point[top2 - (i - top1)];
+ }
+ }
+ n_poly = top1 + top2;
+}
+
+__device__ inline double intersectAreaPoly(Point* ps1, int n1, Point* ps2,
+ int n2, double* grad_C) {
+ Point polygon[MAXN];
+ int n = n1 + n2, n_poly = 0;
+ for (int i = 0; i < n1; i++) {
+ for (int j = 0; j < n - n1; j++) {
+ if (point_same(ps1[i], ps2[j])) {
+ for (int k = j; k < n - n1 - 1; k++) {
+ ps2[k] = ps2[k + 1];
+ }
+ n2--;
+ break;
+ }
+ }
+ }
+ n_poly = n1 + n2;
+ for (int i = 0; i < n_poly; i++) {
+ if (i < n1) {
+ polygon[i] = ps1[i];
+ } else {
+ polygon[i] = ps2[i - n1];
+ }
+ }
+
+ Jarvis(polygon, n_poly);
+
+ int polygon_to_pred_index[18] = {-1, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1};
+ int n_pred = 0;
+ for (int i = 0; i < n_poly; i++) {
+ for (int j = 0; j < n1; j++) {
+ if (polygon[i].x == ps1[j].x && polygon[i].y == ps1[j].y) {
+ polygon_to_pred_index[n_pred] = i;
+ polygon_to_pred_index[n_pred + n1] = j;
+ n_pred += 1;
+ break;
+ }
+ }
+ }
+ if (n_pred == 0) {
+ double polygon_area = fabs(area(polygon, n_poly));
+ for (int i = 0; i < 18; i++) {
+ grad_C[i] = 0.0;
+ }
+ return polygon_area;
+ } else {
+ double polygon_area =
+ polygon_area_grad(polygon, n_poly, polygon_to_pred_index, n1, grad_C);
+ if (polygon_area < 0) {
+ for (int i = 0; i < 18; i++) {
+ grad_C[i] = -grad_C[i];
+ }
+ }
+ return fabs(polygon_area);
+ }
+}
+
+// convex_find and get the polygon_index_box_index
+__device__ inline void Jarvis_and_index(Point* in_poly, int& n_poly,
+ int* points_to_convex_ind) {
+ int n_input = n_poly;
+ Point input_poly[20];
+ for (int i = 0; i < n_input; i++) {
+ input_poly[i].x = in_poly[i].x;
+ input_poly[i].y = in_poly[i].y;
+ }
+ Point p_max, p_k;
+ int max_index, k_index;
+ int Stack[20], top1, top2;
+ double sign;
+ Point right_point[10], left_point[10];
+
+ for (int i = 0; i < n_poly; i++) {
+ if (in_poly[i].y < in_poly[0].y ||
+ in_poly[i].y == in_poly[0].y && in_poly[i].x < in_poly[0].x) {
+ Point* j = &(in_poly[0]);
+ Point* k = &(in_poly[i]);
+ swap1(j, k);
+ }
+ if (i == 0) {
+ p_max = in_poly[0];
+ max_index = 0;
+ }
+ if (in_poly[i].y > p_max.y ||
+ in_poly[i].y == p_max.y && in_poly[i].x > p_max.x) {
+ p_max = in_poly[i];
+ max_index = i;
+ }
+ }
+ if (max_index == 0) {
+ max_index = 1;
+ p_max = in_poly[max_index];
+ }
+
+ k_index = 0, Stack[0] = 0, top1 = 0;
+ while (k_index != max_index) {
+ p_k = p_max;
+ k_index = max_index;
+ for (int i = 1; i < n_poly; i++) {
+ sign = cross(in_poly[Stack[top1]], in_poly[i], p_k);
+ if ((sign > 0) || ((sign == 0) && (dis(in_poly[Stack[top1]], in_poly[i]) >
+ dis(in_poly[Stack[top1]], p_k)))) {
+ p_k = in_poly[i];
+ k_index = i;
+ }
+ }
+ top1++;
+ Stack[top1] = k_index;
+ }
+ for (int i = 0; i <= top1; i++) {
+ right_point[i] = in_poly[Stack[i]];
+ }
+
+ k_index = 0, Stack[0] = 0, top2 = 0;
+
+ while (k_index != max_index) {
+ p_k = p_max;
+ k_index = max_index;
+ for (int i = 1; i < n_poly; i++) {
+ sign = cross(in_poly[Stack[top2]], in_poly[i], p_k);
+ if ((sign < 0) || (sign == 0) && (dis(in_poly[Stack[top2]], in_poly[i]) >
+ dis(in_poly[Stack[top2]], p_k))) {
+ p_k = in_poly[i];
+ k_index = i;
+ }
+ }
+ top2++;
+ Stack[top2] = k_index;
+ }
+
+ for (int i = top2 - 1; i >= 0; i--) {
+ left_point[i] = in_poly[Stack[i]];
+ }
+
+ for (int i = 0; i < top1 + top2; i++) {
+ if (i <= top1) {
+ in_poly[i] = right_point[i];
+ } else {
+ in_poly[i] = left_point[top2 - (i - top1)];
+ }
+ }
+ n_poly = top1 + top2;
+ for (int i = 0; i < n_poly; i++) {
+ for (int j = 0; j < n_input; j++) {
+ if (point_same(in_poly[i], input_poly[j])) {
+ points_to_convex_ind[i] = j;
+ break;
+ }
+ }
+ }
+}
+
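+// Convex GIoU (rot_giou below) between the convex hull of the 9 predicted points and the
+// 4-point gt box; analytic gradients for the predicted points are written to point_grad.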
+template <typename T>
+__device__ inline float devrIoU(T const* const p, T const* const q,
+ T* point_grad, const int idx) {
+ Point ps1[MAXN], ps2[MAXN];
+
+ Point convex[MAXN];
+ for (int i = 0; i < 9; i++) {
+ convex[i].x = (double)p[i * 2];
+ convex[i].y = (double)p[i * 2 + 1];
+ }
+ int n_convex = 9;
+ int points_to_convex_ind[9] = {-1, -1, -1, -1, -1, -1, -1, -1, -1};
+ Jarvis_and_index(convex, n_convex, points_to_convex_ind);
+
+ int n1 = n_convex;
+ int n2 = 4;
+
+ for (int i = 0; i < n1; i++) {
+ ps1[i].x = (double)convex[i].x;
+ ps1[i].y = (double)convex[i].y;
+ }
+
+ for (int i = 0; i < n2; i++) {
+ ps2[i].x = (double)q[i * 2];
+ ps2[i].y = (double)q[i * 2 + 1];
+ }
+
+ int polygon_index_box_index[18];
+ for (int i = 0; i < n1; i++) {
+ polygon_index_box_index[i] = i;
+ polygon_index_box_index[i + n1] = i;
+ }
+
+ double grad_A[18] = {};
+ double grad_AB[18] = {};
+ double grad_C[18] = {};
+
+ double inter_area = intersectAreaO(ps1, n1, ps2, n2, grad_AB);
+ double S_pred =
+ polygon_area_grad(ps1, n1, polygon_index_box_index, n1, grad_A);
+ if (S_pred < 0) {
+ for (int i = 0; i < n_convex * 2; i++) {
+ grad_A[i] = -grad_A[i];
+ }
+ }
+ double union_area = fabs(S_pred) + fabs(area(ps2, n2)) - inter_area;
+
+ double iou = inter_area / union_area;
+ double polygon_area = intersectAreaPoly(ps1, n1, ps2, n2, grad_C);
+
+ // printf("%d:live\n", idx);
+ double rot_giou = iou - (polygon_area - union_area) / polygon_area;
+
+ float grad_point_temp[18] = {};
+
+ for (int i = 0; i < n_convex; i++) {
+ int grad_point = points_to_convex_ind[i];
+ grad_point_temp[2 * grad_point] =
+ (float)((union_area + inter_area) / (union_area * union_area) *
+ grad_AB[2 * i] -
+ iou / union_area * grad_A[2 * i] -
+ 1 / polygon_area * (grad_AB[2 * i] - grad_A[2 * i]) -
+ (union_area) / polygon_area / polygon_area * grad_C[2 * i]);
+ grad_point_temp[2 * grad_point + 1] =
+ (float)((union_area + inter_area) / (union_area * union_area) *
+ grad_AB[2 * i + 1] -
+ iou / union_area * grad_A[2 * i + 1] -
+ 1 / polygon_area * (grad_AB[2 * i + 1] - grad_A[2 * i + 1]) -
+ (union_area) / polygon_area / polygon_area * grad_C[2 * i + 1]);
+ }
+
+ for (int i = 0; i < 9; i++) {
+ point_grad[2 * i] = grad_point_temp[2 * i];
+ point_grad[2 * i + 1] = grad_point_temp[2 * i + 1];
+ }
+ return (float)rot_giou;
+}
+
+template <typename T>
+__global__ void convex_giou_cuda_kernel(const int ex_n_boxes,
+ const int gt_n_boxes, const T* ex_boxes,
+ const T* gt_boxes, T* point_grad) {
+ CUDA_1D_KERNEL_LOOP(index, ex_n_boxes) {
+ const T* cur_box = ex_boxes + index * 18;
+ const T* cur_gt_box = gt_boxes + index * 8;
+ T* cur_grad = point_grad + index * 19;
+ T giou = devrIoU(cur_box, cur_gt_box, cur_grad, threadIdx.x);
+ cur_grad[18] = giou;
+ }
+}
+
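+// Lighter-weight overloads below (no gradient bookkeeping), used by the plain convex IoU kernel.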
+__device__ inline int lineCross(Point a, Point b, Point c, Point d, Point& p) {
+ double s1, s2;
+ s1 = cross(a, b, c);
+ s2 = cross(a, b, d);
+ if (sig(s1) == 0 && sig(s2) == 0) return 2;
+ if (sig(s2 - s1) == 0) return 0;
+ p.x = (c.x * s2 - d.x * s1) / (s2 - s1);
+ p.y = (c.y * s2 - d.y * s1) / (s2 - s1);
+ return 1;
+}
+
+__device__ inline void polygon_cut(Point* p, int& n, Point a, Point b) {
+ Point pp[MAXN];
+ int m = 0;
+ p[n] = p[0];
+ for (int i = 0; i < n; i++) {
+ if (sig(cross(a, b, p[i])) > 0) {
+ pp[m] = p[i];
+ m++;
+ }
+ if (sig(cross(a, b, p[i])) != sig(cross(a, b, p[i + 1]))) {
+ lineCross(a, b, p[i], p[i + 1], pp[m]);
+ m++;
+ }
+ }
+ n = 0;
+ for (int i = 0; i < m; i++) {
+ if (!i || !(point_same(pp[i], pp[i - 1]))) {
+ p[n] = pp[i];
+ n++;
+ }
+ }
+
+ while (n > 1 && point_same(p[n - 1], p[0])) n--;
+}
+
+__device__ inline double intersectArea(Point a, Point b, Point c, Point d) {
+ Point o(0, 0);
+ int s1 = sig(cross(o, a, b));
+ int s2 = sig(cross(o, c, d));
+ if (s1 == 0 || s2 == 0) return 0.0;
+ if (s1 == -1) {
+ Point* i = &a;
+ Point* j = &b;
+ swap1(i, j);
+ }
+ if (s2 == -1) {
+ Point* i = &c;
+ Point* j = &d;
+ swap1(i, j);
+ }
+ Point p[10] = {o, a, b};
+ int n = 3;
+
+ polygon_cut(p, n, o, c);
+ polygon_cut(p, n, c, d);
+ polygon_cut(p, n, d, o);
+ double res = area(p, n);
+ if (s1 * s2 == -1) res = -res;
+ return res;
+}
+__device__ inline double intersectAreaO(Point* ps1, int n1, Point* ps2,
+ int n2) {
+ if (area(ps1, n1) < 0) reverse1(ps1, n1);
+ if (area(ps2, n2) < 0) reverse1(ps2, n2);
+ ps1[n1] = ps1[0];
+ ps2[n2] = ps2[0];
+ double res = 0;
+ for (int i = 0; i < n1; i++) {
+ for (int j = 0; j < n2; j++) {
+ res += intersectArea(ps1[i], ps1[i + 1], ps2[j], ps2[j + 1]);
+ }
+ }
+ return res;
+}
+
+template <typename T>
+__device__ inline float devrIoU(T const* const p, T const* const q) {
+ Point ps1[MAXN], ps2[MAXN];
+ Point convex[MAXN];
+ for (int i = 0; i < 9; i++) {
+ convex[i].x = (double)p[i * 2];
+ convex[i].y = (double)p[i * 2 + 1];
+ }
+ int n_convex = 9;
+ int points_to_convex_ind[9] = {-1, -1, -1, -1, -1, -1, -1, -1, -1};
+ Jarvis_and_index(convex, n_convex, points_to_convex_ind);
+ int n1 = n_convex;
+ for (int i = 0; i < n1; i++) {
+ ps1[i].x = (double)convex[i].x;
+ ps1[i].y = (double)convex[i].y;
+ }
+ int n2 = 4;
+ for (int i = 0; i < n2; i++) {
+ ps2[i].x = (double)q[i * 2];
+ ps2[i].y = (double)q[i * 2 + 1];
+ }
+ double inter_area = intersectAreaO(ps1, n1, ps2, n2);
+ double S_pred = area(ps1, n1);
+ double union_area = fabs(S_pred) + fabs(area(ps2, n2)) - inter_area;
+ double iou = inter_area / union_area;
+ return (float)iou;
+}
+
+template <typename T>
+__global__ void convex_iou_cuda_kernel(const int ex_n_boxes,
+ const int gt_n_boxes, const T* ex_boxes,
+ const T* gt_boxes, T* iou) {
+ CUDA_1D_KERNEL_LOOP(index, ex_n_boxes) {
+ const T* cur_box = ex_boxes + index * 18;
+ for (int i = 0; i < gt_n_boxes; i++) {
+ iou[index * gt_n_boxes + i] = devrIoU(cur_box, gt_boxes + i * 8);
+ }
+ }
+}
+#endif // CONVEX_IOU_CUDA_KERNEL_CUH
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/csrc/common/cuda/correlation_cuda.cuh b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/csrc/common/cuda/correlation_cuda.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..c4674fab776a409cd4a0019c85b2f39699e6dc29
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/csrc/common/cuda/correlation_cuda.cuh
@@ -0,0 +1,244 @@
+// encoding=utf-8
+// Copyright 2021 Huawei Technologies Co., Ltd
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// Modified from
+// https://github.com/ClementPinard/Pytorch-Correlation-extension/blob/master/Correlation_Module/correlation_cuda_kernel.cu
+// Original licence: Under MIT License
+
+#ifndef CORRELATION_CUDA
+#define CORRELATION_CUDA
+
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else
+#include "pytorch_cuda_helper.hpp"
+#endif
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+// Using <torch/extension.h> is recommended in the official documentation in
+// https://pytorch.org/tutorials/advanced/cpp_extension.html#writing-the-c-op.
+// However, we use <torch/types.h> for compatibility with CUDA 9.0
+// Read https://github.com/pytorch/extension-cpp/issues/35 for more details.
+#include <torch/types.h>
+
+#include <iostream>
+#include <vector>
+
+using namespace torch;
+
+#define TensorAcc4R PackedTensorAccessor32<scalar_t, 4, RestrictPtrTraits>
+#define TensorAcc5R PackedTensorAccessor32<scalar_t, 5, RestrictPtrTraits>
+#define WITHIN_BOUNDS(x, y, H, W) (x >= 0 && x < H && y >= 0 && y < W)
+
+#define THREADS_FORWARD 32
+#define THREADS_BACKWARD 16
+
+template <typename scalar_t>
+__global__ void correlation_forward_cuda_kernel(
+ const TensorAcc4R rInput1, const TensorAcc4R rInput2, TensorAcc5R output,
+ int kH, int kW, int patchH, int patchW, int padH, int padW, int dilationH,
+ int dilationW, int dilation_patchH, int dilation_patchW, int dH, int dW) {
+ const int iH = rInput1.size(1);
+ const int iW = rInput1.size(2);
+ const int C = rInput1.size(3);
+
+ const int n = blockIdx.x;
+ const int h = blockIdx.y;
+ const int w = blockIdx.z;
+ const int thread = threadIdx.x;
+
+ const int start_i = -padH + h * dH;
+ const int start_j = -padW + w * dW;
+
+ const int patchRadH = dilation_patchH * (patchH - 1) / 2;
+ const int patchRadW = dilation_patchW * (patchW - 1) / 2;
+
+ __shared__ scalar_t prod_sum[THREADS_FORWARD];
+
+ for (int ph = 0; ph < patchH; ++ph) {
+ int ph_dilated = ph * dilation_patchH - patchRadH;
+ for (int pw = 0; pw < patchW; ++pw) {
+ int pw_dilated = pw * dilation_patchW - patchRadW;
+ prod_sum[thread] = 0;
+ for (int i = 0; i < kH; ++i) {
+ int i1 = start_i + i * dilationH;
+ int i2 = i1 + ph_dilated;
+ if
+ WITHIN_BOUNDS(i1, i2, iH, iH) {
+ for (int j = 0; j < kW; ++j) {
+ int j1 = start_j + j * dilationW;
+ int j2 = j1 + pw_dilated;
+ if
+ WITHIN_BOUNDS(j1, j2, iW, iW) {
+ for (int c = thread; c < C; c += THREADS_FORWARD) {
+ scalar_t v1 = rInput1[n][i1][j1][c];
+ scalar_t v2 = rInput2[n][i2][j2][c];
+ prod_sum[thread] += v1 * v2;
+ }
+ }
+ }
+ }
+ }
+ // accumulate
+ __syncthreads();
+ if (thread == 0) {
+ scalar_t reduce_sum = 0;
+ for (int index = 0; index < THREADS_FORWARD; ++index) {
+ reduce_sum += prod_sum[index];
+ }
+ output[n][ph][pw][h][w] = reduce_sum;
+ }
+ }
+ }
+}
+
+template <typename scalar_t>
+__global__ void correlation_backward_cuda_kernel_input1(
+ const TensorAcc5R grad_output, const TensorAcc4R input2,
+ TensorAcc4R grad_input1, const int kH, const int kW, const int patchH,
+ const int patchW, const int padH, const int padW, const int dilationH,
+ const int dilationW, const int dilation_patchH, const int dilation_patchW,
+ const int dH, const int dW, const int batch) {
+ const int iH = input2.size(2);
+ const int iW = input2.size(3);
+
+ const int H = grad_output.size(3);
+ const int W = grad_output.size(4);
+
+ const int patchRadH = (patchH - 1) / 2;
+ const int patchRadW = (patchW - 1) / 2;
+
+ const int n = batch;
+ const int c = blockIdx.x;
+ const int h = blockIdx.y;
+ const int w = blockIdx.z;
+ const int ph_off = threadIdx.x;
+ const int pw_off = threadIdx.y;
+
+ const int h_2 = h + padH;
+ const int w_2 = w + padW;
+ const int min_h = h_2 - kH * dilationH;
+ const int min_w = w_2 - kW * dilationW;
+
+ __shared__ scalar_t prod_sum[THREADS_BACKWARD][THREADS_BACKWARD];
+ prod_sum[ph_off][pw_off] = 0;
+
+ for (int ph = ph_off; ph < patchH; ph += THREADS_BACKWARD) {
+ int i1 = h + dilation_patchH * (ph - patchRadH);
+ for (int pw = pw_off; pw < patchW; pw += THREADS_BACKWARD) {
+ int j1 = w + dilation_patchW * (pw - patchRadW);
+ if (WITHIN_BOUNDS(i1, j1, iH, iW)) {
+ scalar_t val = input2[n][c][i1][j1];
+ for (int h_3 = h_2; h_3 > min_h; h_3 -= dilationH) {
+ int i2 = (h_3) / dH;
+ if (i2 * dH != h_3) continue;
+ for (int w_3 = w_2; w_3 > min_w; w_3 -= dilationW) {
+ int j2 = (w_3) / dW;
+ if (j2 * dW != w_3) continue;
+ if
+ WITHIN_BOUNDS(i2, j2, H, W) {
+ prod_sum[ph_off][pw_off] +=
+ grad_output[n][ph][pw][i2][j2] * val;
+ }
+ }
+ }
+ }
+ }
+ }
+
+ __syncthreads();
+
+ if (ph_off == 0 && pw_off == 0) {
+ scalar_t reduce_sum = 0;
+ for (int ph = 0; ph < THREADS_BACKWARD; ++ph) {
+ for (int pw = 0; pw < THREADS_BACKWARD; ++pw) {
+ reduce_sum += prod_sum[ph][pw];
+ }
+ }
+ grad_input1[n][c][h][w] = reduce_sum;
+ }
+}
+
+template <typename scalar_t>
+__global__ void correlation_backward_cuda_kernel_input2(
+ const TensorAcc5R grad_output, const TensorAcc4R input1,
+ TensorAcc4R grad_input2, int kH, int kW, int patchH, int patchW, int padH,
+ int padW, int dilationH, int dilationW, int dilation_patchH,
+ int dilation_patchW, int dH, int dW, int batch) {
+ const int iH = input1.size(2);
+ const int iW = input1.size(3);
+
+ const int patchRadH = (patchH - 1) / 2;
+ const int patchRadW = (patchW - 1) / 2;
+
+ const int H = grad_output.size(3);
+ const int W = grad_output.size(4);
+
+ const int dilatedKH = kH * dilationH;
+ const int dilatedKW = kW * dilationW;
+
+ const int n = batch;
+ const int c = blockIdx.x;
+ const int h = blockIdx.y;
+ const int w = blockIdx.z;
+ const int ph_off = threadIdx.x;
+ const int pw_off = threadIdx.y;
+
+ __shared__ scalar_t prod_sum[THREADS_BACKWARD][THREADS_BACKWARD];
+ prod_sum[ph_off][pw_off] = 0;
+
+ for (int ph = ph_off; ph < patchH; ph += THREADS_BACKWARD) {
+ int i1 = h - dilation_patchH * (ph - patchRadH);
+ for (int pw = pw_off; pw < patchW; pw += THREADS_BACKWARD) {
+ int j1 = w - dilation_patchW * (pw - patchRadW);
+ if
+ WITHIN_BOUNDS(i1, j1, iH, iW) {
+ scalar_t val = input1[n][c][i1][j1];
+
+ const int h_2 = i1 + padH;
+ const int w_2 = j1 + padW;
+ const int min_h = h_2 - dilatedKH;
+ const int min_w = w_2 - dilatedKW;
+
+ for (int h_3 = h_2; h_3 > min_h; h_3 -= dilationH) {
+ int i2 = (h_3) / dH;
+ if (i2 * dH != h_3) continue;
+ for (int w_3 = w_2; w_3 > min_w; w_3 -= dilationW) {
+ int j2 = (w_3) / dW;
+ if (j2 * dW != w_3) continue;
+ if
+ WITHIN_BOUNDS(i2, j2, H, W) {
+ prod_sum[ph_off][pw_off] +=
+ grad_output[n][ph][pw][i2][j2] * val;
+ }
+ }
+ }
+ }
+ }
+ }
+
+ __syncthreads();
+
+ if (ph_off == 0 && pw_off == 0) {
+ scalar_t reduce_sum = 0;
+ for (int ph = 0; ph < THREADS_BACKWARD; ++ph) {
+ for (int pw = 0; pw < THREADS_BACKWARD; ++pw) {
+ reduce_sum += prod_sum[ph][pw];
+ }
+ }
+ grad_input2[n][c][h][w] = reduce_sum;
+ }
+}
+#endif
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/csrc/common/cuda/deform_conv_cuda_kernel.cuh b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/csrc/common/cuda/deform_conv_cuda_kernel.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..6b4d1bbd85bad1b87ee5d6b8a3cd3b29e3cbc411
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/csrc/common/cuda/deform_conv_cuda_kernel.cuh
@@ -0,0 +1,367 @@
+/*!
+ ******************* BEGIN Caffe Copyright Notice and Disclaimer
+ *****************
+ *
+ * COPYRIGHT
+ *
+ * All contributions by the University of California:
+ * Copyright (c) 2014-2017 The Regents of the University of California (Regents)
+ * All rights reserved.
+ *
+ * All other contributions:
+ * Copyright (c) 2014-2017, the respective contributors
+ * All rights reserved.
+ *
+ * Caffe uses a shared copyright model: each contributor holds copyright over
+ * their contributions to Caffe. The project versioning records all such
+ * contribution and copyright details. If a contributor wants to further mark
+ * their specific copyright on a particular contribution, they should indicate
+ * their copyright solely in the commit message of the change when it is
+ * committed.
+ *
+ * LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+ *FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ *DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ *SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ *OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * CONTRIBUTION AGREEMENT
+ *
+ * By contributing to the BVLC/caffe repository through pull-request, comment,
+ * or otherwise, the contributor releases their content to the
+ * license and copyright terms herein.
+ *
+ ***************** END Caffe Copyright Notice and Disclaimer
+ *********************
+ *
+ * Copyright (c) 2018 Microsoft
+ * Licensed under The MIT License [see LICENSE for details]
+ * \file modulated_deformable_im2col.cuh
+ * \brief Function definitions of converting an image to
+ * column matrix based on kernel, padding, dilation, and offset.
+ * These functions are mainly used in deformable convolution operators.
+ * \ref: https://arxiv.org/abs/1703.06211
+ * \author Yuwen Xiong, Haozhi Qi, Jifeng Dai, Xizhou Zhu, Han Hu, Dazhi Cheng
+ */
+
+// modified from
+// https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/deform_conv_cuda_kernel.cu
+
+#ifndef DEFORM_CONV_CUDA_KERNEL_CUH
+#define DEFORM_CONV_CUDA_KERNEL_CUH
+
+#include <float.h>
+#ifdef MMCV_WITH_TRT
+#include "common_cuda_helper.hpp"
+#else // MMCV_WITH_TRT
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else // MMCV_USE_PARROTS
+#include "pytorch_cuda_helper.hpp"
+#endif // MMCV_USE_PARROTS
+#endif // MMCV_WITH_TRT
+
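+// Bilinear sampling of input at fractional (h, w); returns 0 outside the feature map.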
+template <typename T>
+__device__ T deformable_im2col_bilinear(const T *input, const int data_width,
+ const int height, const int width, T h,
+ T w) {
+ if (h <= -1 || height <= h || w <= -1 || width <= w) {
+ return 0;
+ }
+
+ int h_low = floorf(h);
+ int w_low = floorf(w);
+ int h_high = h_low + 1;
+ int w_high = w_low + 1;
+
+ T lh = h - h_low;
+ T lw = w - w_low;
+ T hh = 1 - lh, hw = 1 - lw;
+
+ T v1 = 0;
+ if (h_low >= 0 && w_low >= 0) v1 = input[h_low * data_width + w_low];
+ T v2 = 0;
+ if (h_low >= 0 && w_high <= width - 1)
+ v2 = input[h_low * data_width + w_high];
+ T v3 = 0;
+ if (h_high <= height - 1 && w_low >= 0)
+ v3 = input[h_high * data_width + w_low];
+ T v4 = 0;
+ if (h_high <= height - 1 && w_high <= width - 1)
+ v4 = input[h_high * data_width + w_high];
+
+ T w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
+
+ T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
+ return val;
+}
+
+template <typename T>
+__device__ T get_gradient_weight(T argmax_h, T argmax_w, const int h,
+ const int w, const int height,
+ const int width) {
+ if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 ||
+ argmax_w >= width) {
+ // empty
+ return 0;
+ }
+
+ int argmax_h_low = floorf(argmax_h);
+ int argmax_w_low = floorf(argmax_w);
+ int argmax_h_high = argmax_h_low + 1;
+ int argmax_w_high = argmax_w_low + 1;
+
+ T weight = 0;
+ if (h == argmax_h_low && w == argmax_w_low)
+ weight = (h + 1 - argmax_h) * (w + 1 - argmax_w);
+ if (h == argmax_h_low && w == argmax_w_high)
+ weight = (h + 1 - argmax_h) * (argmax_w + 1 - w);
+ if (h == argmax_h_high && w == argmax_w_low)
+ weight = (argmax_h + 1 - h) * (w + 1 - argmax_w);
+ if (h == argmax_h_high && w == argmax_w_high)
+ weight = (argmax_h + 1 - h) * (argmax_w + 1 - w);
+ return weight;
+}
+
+template <typename T>
+__device__ T get_coordinate_weight(T argmax_h, T argmax_w, const int height,
+ const int width, const T *im_data,
+ const int data_width, const int bp_dir) {
+ if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 ||
+ argmax_w >= width) {
+ // empty
+ return 0;
+ }
+
+ int argmax_h_low = floorf(argmax_h);
+ int argmax_w_low = floorf(argmax_w);
+ int argmax_h_high = argmax_h_low + 1;
+ int argmax_w_high = argmax_w_low + 1;
+
+ T weight = 0;
+
+ if (bp_dir == 0) {
+ if (argmax_h_low >= 0 && argmax_w_low >= 0)
+ weight += -1 * (argmax_w_low + 1 - argmax_w) *
+ im_data[argmax_h_low * data_width + argmax_w_low];
+ if (argmax_h_low >= 0 && argmax_w_high <= width - 1)
+ weight += -1 * (argmax_w - argmax_w_low) *
+ im_data[argmax_h_low * data_width + argmax_w_high];
+ if (argmax_h_high <= height - 1 && argmax_w_low >= 0)
+ weight += (argmax_w_low + 1 - argmax_w) *
+ im_data[argmax_h_high * data_width + argmax_w_low];
+ if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
+ weight += (argmax_w - argmax_w_low) *
+ im_data[argmax_h_high * data_width + argmax_w_high];
+ } else if (bp_dir == 1) {
+ if (argmax_h_low >= 0 && argmax_w_low >= 0)
+ weight += -1 * (argmax_h_low + 1 - argmax_h) *
+ im_data[argmax_h_low * data_width + argmax_w_low];
+ if (argmax_h_low >= 0 && argmax_w_high <= width - 1)
+ weight += (argmax_h_low + 1 - argmax_h) *
+ im_data[argmax_h_low * data_width + argmax_w_high];
+ if (argmax_h_high <= height - 1 && argmax_w_low >= 0)
+ weight += -1 * (argmax_h - argmax_h_low) *
+ im_data[argmax_h_high * data_width + argmax_w_low];
+ if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
+ weight += (argmax_h - argmax_h_low) *
+ im_data[argmax_h_high * data_width + argmax_w_high];
+ }
+
+ return weight;
+}
+
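+// Deformable im2col: each thread fills the kernel_h x kernel_w column entries for one
+// (channel, output location), sampling offset-shifted positions with bilinear interpolation.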
+template <typename T>
+__global__ void deformable_im2col_gpu_kernel(
+ const int n, const T *data_im, const T *data_offset, const int height,
+ const int width, const int kernel_h, const int kernel_w, const int pad_h,
+ const int pad_w, const int stride_h, const int stride_w,
+ const int dilation_h, const int dilation_w,
+ const int channel_per_deformable_group, const int batch_size,
+ const int num_channels, const int deformable_group, const int height_col,
+ const int width_col, T *data_col) {
+ CUDA_1D_KERNEL_LOOP(index, n) {
+ // 'index' is an index of the output matrix
+ const int w_col = index % width_col;
+ const int h_col = (index / width_col) % height_col;
+ const int b_col = (index / width_col / height_col) % batch_size;
+ const int c_im = (index / width_col / height_col) / batch_size;
+ const int c_col = c_im * kernel_h * kernel_w;
+
+ // compute deformable group index
+ const int deformable_group_index = c_im / channel_per_deformable_group;
+
+ const int h_in = h_col * stride_h - pad_h;
+ const int w_in = w_col * stride_w - pad_w;
+ T *data_col_ptr =
+ data_col +
+ ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col;
+ const T *data_im_ptr =
+ data_im + (b_col * num_channels + c_im) * height * width;
+ const T *data_offset_ptr =
+ data_offset + (b_col * deformable_group + deformable_group_index) * 2 *
+ kernel_h * kernel_w * height_col * width_col;
+
+ for (int i = 0; i < kernel_h; ++i) {
+ for (int j = 0; j < kernel_w; ++j) {
+ const int data_offset_h_ptr =
+ ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col;
+ const int data_offset_w_ptr =
+ ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col +
+ w_col;
+ const T offset_h = data_offset_ptr[data_offset_h_ptr];
+ const T offset_w = data_offset_ptr[data_offset_w_ptr];
+ T val = static_cast<T>(0);
+ const T h_im = h_in + i * dilation_h + offset_h;
+ const T w_im = w_in + j * dilation_w + offset_w;
+ if (h_im > -1 && w_im > -1 && h_im < height && w_im < width)
+ val = deformable_im2col_bilinear(data_im_ptr, width, height, width,
+ h_im, w_im);
+ *data_col_ptr = val;
+ data_col_ptr += batch_size * height_col * width_col;
+ }
+ }
+ }
+}
+
+template <typename T>
+__global__ void deformable_col2im_gpu_kernel(
+ const int n, const T *data_col, const T *data_offset, const int channels,
+ const int height, const int width, const int kernel_h, const int kernel_w,
+ const int pad_h, const int pad_w, const int stride_h, const int stride_w,
+ const int dilation_h, const int dilation_w,
+ const int channel_per_deformable_group, const int batch_size,
+ const int deformable_group, const int height_col, const int width_col,
+ T *grad_im) {
+ CUDA_1D_KERNEL_LOOP(index, n) {
+ const int j = (index / width_col / height_col / batch_size) % kernel_w;
+ const int i =
+ (index / width_col / height_col / batch_size / kernel_w) % kernel_h;
+ const int c =
+ index / width_col / height_col / batch_size / kernel_w / kernel_h;
+ // compute the start and end of the output
+
+ const int deformable_group_index = c / channel_per_deformable_group;
+
+ int w_out = index % width_col;
+ int h_out = (index / width_col) % height_col;
+ int b = (index / width_col / height_col) % batch_size;
+ int w_in = w_out * stride_w - pad_w;
+ int h_in = h_out * stride_h - pad_h;
+
+ const T *data_offset_ptr =
+ data_offset + (b * deformable_group + deformable_group_index) * 2 *
+ kernel_h * kernel_w * height_col * width_col;
+ const int data_offset_h_ptr =
+ ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out;
+ const int data_offset_w_ptr =
+ ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out;
+ const T offset_h = data_offset_ptr[data_offset_h_ptr];
+ const T offset_w = data_offset_ptr[data_offset_w_ptr];
+ const T cur_inv_h_data = h_in + i * dilation_h + offset_h;
+ const T cur_inv_w_data = w_in + j * dilation_w + offset_w;
+
+ const T cur_top_grad = data_col[index];
+ const int cur_h = (int)cur_inv_h_data;
+ const int cur_w = (int)cur_inv_w_data;
+ for (int dy = -2; dy <= 2; dy++) {
+ for (int dx = -2; dx <= 2; dx++) {
+ if (cur_h + dy >= 0 && cur_h + dy < height && cur_w + dx >= 0 &&
+ cur_w + dx < width && abs(cur_inv_h_data - (cur_h + dy)) < 1 &&
+ abs(cur_inv_w_data - (cur_w + dx)) < 1) {
+ int cur_bottom_grad_pos =
+ ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx;
+ T weight = get_gradient_weight(cur_inv_h_data, cur_inv_w_data,
+ cur_h + dy, cur_w + dx, height, width);
+ atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad);
+ }
+ }
+ }
+ }
+}
+
+template <typename T>
+__global__ void deformable_col2im_coord_gpu_kernel(
+ const int n, const T *data_col, const T *data_im, const T *data_offset,
+ const int channels, const int height, const int width, const int kernel_h,
+ const int kernel_w, const int pad_h, const int pad_w, const int stride_h,
+ const int stride_w, const int dilation_h, const int dilation_w,
+ const int channel_per_deformable_group, const int batch_size,
+ const int offset_channels, const int deformable_group, const int height_col,
+ const int width_col, T *grad_offset) {
+ CUDA_1D_KERNEL_LOOP(index, n) {
+ T val = 0;
+ int w = index % width_col;
+ int h = (index / width_col) % height_col;
+ int c = (index / width_col / height_col) % offset_channels;
+ int b = (index / width_col / height_col) / offset_channels;
+ // compute the start and end of the output
+
+ const int deformable_group_index = c / (2 * kernel_h * kernel_w);
+ const int col_step = kernel_h * kernel_w;
+ int cnt = 0;
+ const T *data_col_ptr = data_col + deformable_group_index *
+ channel_per_deformable_group *
+ batch_size * width_col * height_col;
+ const T *data_im_ptr =
+ data_im + (b * deformable_group + deformable_group_index) *
+ channel_per_deformable_group / kernel_h / kernel_w *
+ height * width;
+ const T *data_offset_ptr =
+ data_offset + (b * deformable_group + deformable_group_index) * 2 *
+ kernel_h * kernel_w * height_col * width_col;
+
+ const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w;
+
+ for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group;
+ col_c += col_step) {
+ const int col_pos =
+ (((col_c * batch_size + b) * height_col) + h) * width_col + w;
+ const int bp_dir = offset_c % 2;
+
+ int j = (col_pos / width_col / height_col / batch_size) % kernel_w;
+ int i =
+ (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h;
+ int w_out = col_pos % width_col;
+ int h_out = (col_pos / width_col) % height_col;
+ int w_in = w_out * stride_w - pad_w;
+ int h_in = h_out * stride_h - pad_h;
+ const int data_offset_h_ptr =
+ (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out);
+ const int data_offset_w_ptr =
+ (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col +
+ w_out);
+ const T offset_h = data_offset_ptr[data_offset_h_ptr];
+ const T offset_w = data_offset_ptr[data_offset_w_ptr];
+ T inv_h = h_in + i * dilation_h + offset_h;
+ T inv_w = w_in + j * dilation_w + offset_w;
+ if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width)
+ inv_h = inv_w = -2;
+ const T weight = get_coordinate_weight(inv_h, inv_w, height, width,
+ data_im_ptr + cnt * height * width,
+ width, bp_dir);
+ val += weight * data_col_ptr[col_pos];
+ cnt += 1;
+ }
+
+ grad_offset[index] = val;
+ }
+}
+
+#endif // DEFORM_CONV_CUDA_KERNEL_CUH
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/csrc/common/cuda/deform_roi_pool_cuda_kernel.cuh b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/csrc/common/cuda/deform_roi_pool_cuda_kernel.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..850c338bb74e2ce9b8aca6e53863887e4d25675a
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/csrc/common/cuda/deform_roi_pool_cuda_kernel.cuh
@@ -0,0 +1,199 @@
+// encoding=utf-8
+// Copyright 2021 Huawei Technologies Co., Ltd
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#ifndef DEFORM_ROI_POOL_CUDA_KERNEL_CUH
+#define DEFORM_ROI_POOL_CUDA_KERNEL_CUH
+
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else
+#include "pytorch_cuda_helper.hpp"
+#endif
+
+template <typename T>
+__global__ void deform_roi_pool_forward_cuda_kernel(
+ const int nthreads, const T* input, const T* rois, const T* offset,
+ T* output, const int pooled_height, const int pooled_width,
+ const T spatial_scale, const int sampling_ratio, const T gamma,
+ const int channels, const int height, const int width) {
+ CUDA_1D_KERNEL_LOOP(index, nthreads) {
+ // (n, c, ph, pw) is an element in the pooled output
+ int pw = index % pooled_width;
+ int ph = (index / pooled_width) % pooled_height;
+ int c = (index / pooled_width / pooled_height) % channels;
+ int n = index / pooled_width / pooled_height / channels;
+
+ const T* offset_rois = rois + n * 5;
+ int roi_batch_ind = offset_rois[0];
+
+    // Do not use rounding; this implementation detail is critical
+ T roi_start_w = offset_rois[1] * spatial_scale - 0.5;
+ T roi_start_h = offset_rois[2] * spatial_scale - 0.5;
+ T roi_end_w = offset_rois[3] * spatial_scale - 0.5;
+ T roi_end_h = offset_rois[4] * spatial_scale - 0.5;
+
+ T roi_width = roi_end_w - roi_start_w;
+ T roi_height = roi_end_h - roi_start_h;
+
+    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
+    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
+
+ const T* offset_input =
+ input + (roi_batch_ind * channels + c) * height * width;
+
+ // We use roi_bin_grid to sample the grid and mimic integral
+ int roi_bin_grid_h =
+ (sampling_ratio > 0)
+ ? sampling_ratio
+            : static_cast<int>(ceilf(roi_height / pooled_height));
+ int roi_bin_grid_w =
+ (sampling_ratio > 0)
+ ? sampling_ratio
+            : static_cast<int>(ceilf(roi_width / pooled_width));
+
+ // Compute roi offset
+ if (offset != NULL) {
+ const T* offset_cur_w = offset + n * pooled_width * pooled_height * 2 +
+ ph * pooled_width + pw;
+ T offset_roi_w = gamma * roi_width * offset_cur_w[0];
+ T offset_roi_h =
+ gamma * roi_height * offset_cur_w[pooled_width * pooled_height];
+ roi_start_w += offset_roi_w;
+ roi_start_h += offset_roi_h;
+ }
+
+ // We do average pooling inside a bin
+ const T count = max(roi_bin_grid_h * roi_bin_grid_w, 1);
+ T output_val = 0.;
+ for (int iy = 0; iy < roi_bin_grid_h; iy++) {
+ const T y = roi_start_h + ph * bin_size_h +
+                  static_cast<T>(iy + .5f) * bin_size_h /
+                      static_cast<T>(roi_bin_grid_h);
+ for (int ix = 0; ix < roi_bin_grid_w; ix++) {
+ const T x = roi_start_w + pw * bin_size_w +
+                    static_cast<T>(ix + .5f) * bin_size_w /
+                        static_cast<T>(roi_bin_grid_w);
+ T val = bilinear_interpolate(offset_input, height, width, y, x, index);
+ output_val += val;
+ }
+ }
+ output[index] = output_val / count;
+ }
+}
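+
+// Illustrative host-side launch for the kernel above (hypothetical sketch;
+// the real dispatch lives in the deform_roi_pool op's .cu file and may
+// differ). It assumes the GET_BLOCKS / THREADS_PER_BLOCK helpers from the
+// included cuda helper header:
+//   int output_size = num_rois * channels * pooled_height * pooled_width;
+//   deform_roi_pool_forward_cuda_kernel<float>
+//       <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
+//           output_size, input, rois, offset, output, pooled_height,
+//           pooled_width, spatial_scale, sampling_ratio, gamma, channels,
+//           height, width);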
+
+template <typename T>
+__global__ void deform_roi_pool_backward_cuda_kernel(
+ const int nthreads, const T* grad_output, const T* input, const T* rois,
+ const T* offset, T* grad_input, T* grad_offset, const int pooled_height,
+ const int pooled_width, const T spatial_scale, const int sampling_ratio,
+ const T gamma, const int channels, const int height, const int width) {
+ CUDA_1D_KERNEL_LOOP(index, nthreads) {
+ // (n, c, ph, pw) is an element in the pooled output
+ int pw = index % pooled_width;
+ int ph = (index / pooled_width) % pooled_height;
+ int c = (index / pooled_width / pooled_height) % channels;
+ int n = index / pooled_width / pooled_height / channels;
+
+ const T* offset_rois = rois + n * 5;
+ int roi_batch_ind = offset_rois[0];
+ const T* offset_input =
+ input + ((roi_batch_ind * channels + c) * height * width);
+ T* offset_grad_input =
+ grad_input + ((roi_batch_ind * channels + c) * height * width);
+
+    // Do not use rounding; this implementation detail is critical
+ T roi_start_w = offset_rois[1] * spatial_scale - 0.5;
+ T roi_start_h = offset_rois[2] * spatial_scale - 0.5;
+ T roi_end_w = offset_rois[3] * spatial_scale - 0.5;
+ T roi_end_h = offset_rois[4] * spatial_scale - 0.5;
+
+ T roi_width = roi_end_w - roi_start_w;
+ T roi_height = roi_end_h - roi_start_h;
+
+    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
+    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
+
+ // We use roi_bin_grid to sample the grid and mimic integral
+ int roi_bin_grid_h =
+ (sampling_ratio > 0)
+ ? sampling_ratio
+            : static_cast<int>(ceilf(roi_height / pooled_height));
+ int roi_bin_grid_w =
+ (sampling_ratio > 0)
+ ? sampling_ratio
+            : static_cast<int>(ceilf(roi_width / pooled_width));
+
+ // Compute roi offset
+ if (offset != NULL) {
+ const T* offset_cur_w = offset + n * pooled_width * pooled_height * 2 +
+ ph * pooled_width + pw;
+ T offset_roi_w = gamma * roi_width * offset_cur_w[0];
+ T offset_roi_h =
+ gamma * roi_height * offset_cur_w[pooled_width * pooled_height];
+ roi_start_w += offset_roi_w;
+ roi_start_h += offset_roi_h;
+ }
+
+ // We do average (integral) pooling inside a bin
+ const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4
+ const T grad_output_this_bin = grad_output[index] / count;
+
+ for (int iy = 0; iy < roi_bin_grid_h; iy++) {
+ const T y = roi_start_h + ph * bin_size_h +
+                  static_cast<T>(iy + .5f) * bin_size_h /
+                      static_cast<T>(roi_bin_grid_h);
+ for (int ix = 0; ix < roi_bin_grid_w; ix++) {
+ const T x = roi_start_w + pw * bin_size_w +
+                    static_cast<T>(ix + .5f) * bin_size_w /
+                        static_cast<T>(roi_bin_grid_w);
+
+ T w1, w2, w3, w4;
+ int x_low, x_high, y_low, y_high;
+ bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, w4,
+ x_low, x_high, y_low, y_high, index);
+
+ if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
+ atomicAdd(offset_grad_input + y_low * width + x_low,
+ grad_output_this_bin * w1);
+ atomicAdd(offset_grad_input + y_low * width + x_high,
+ grad_output_this_bin * w2);
+ atomicAdd(offset_grad_input + y_high * width + x_low,
+ grad_output_this_bin * w3);
+ atomicAdd(offset_grad_input + y_high * width + x_high,
+ grad_output_this_bin * w4);
+ if (offset != NULL) {
+ T input_00 = offset_input[y_low * width + x_low];
+ T input_10 = offset_input[y_low * width + x_high];
+ T input_01 = offset_input[y_high * width + x_low];
+ T input_11 = offset_input[y_high * width + x_high];
+ T ogx = gamma * roi_width * grad_output_this_bin *
+ (input_11 * (y - y_low) + input_10 * (y_high - y) +
+ input_01 * (y_low - y) + input_00 * (y - y_high));
+ T ogy = gamma * roi_height * grad_output_this_bin *
+ (input_11 * (x - x_low) + input_01 * (x_high - x) +
+ input_10 * (x_low - x) + input_00 * (x - x_high));
+ atomicAdd(grad_offset + n * pooled_width * pooled_height * 2 +
+ ph * pooled_width + pw,
+ ogx);
+ atomicAdd(grad_offset + n * pooled_width * pooled_height * 2 +
+ pooled_width * pooled_height + ph * pooled_width + pw,
+ ogy);
+ }
+ }
+ }
+ }
+ }
+}
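+
+// Note on the offset gradients above: the sampling point is shifted by
+// gamma * roi_width * offset_x (and gamma * roi_height * offset_y), so by the
+// chain rule ogx = gamma * roi_width * grad_output_this_bin * d(bilinear)/dx,
+// where the sum over input_00..input_11 is the analytic x-derivative of
+// bilinear interpolation at (y, x); ogy is the analogous y-derivative.
+// atomicAdd is required because every channel and every grid sample of a bin
+// accumulates into the same two grad_offset entries.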
+
+#endif // DEFORM_ROI_POOL_CUDA_KERNEL_CUH
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/csrc/common/cuda/furthest_point_sample_cuda_kernel.cuh b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/csrc/common/cuda/furthest_point_sample_cuda_kernel.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..7242536b47995d93a885170749084a97f2c490f9
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/csrc/common/cuda/furthest_point_sample_cuda_kernel.cuh
@@ -0,0 +1,165 @@
+// encoding=utf-8
+// Copyright 2021 Huawei Technologies Co., Ltd
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#ifndef FURTHEST_POINT_SAMPLE_CUDA_KERNEL_CUH
+#define FURTHEST_POINT_SAMPLE_CUDA_KERNEL_CUH
+
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else
+#include "pytorch_cuda_helper.hpp"
+#endif
+
+__device__ void __update(float *__restrict__ dists, int *__restrict__ dists_i,
+ int idx1, int idx2) {
+ const float v1 = dists[idx1], v2 = dists[idx2];
+ const int i1 = dists_i[idx1], i2 = dists_i[idx2];
+ dists[idx1] = max(v1, v2);
+ dists_i[idx1] = v2 > v1 ? i2 : i1;
+}
+
+template <unsigned int block_size>
+__global__ void furthest_point_sampling_forward_cuda_kernel(
+ int b, int n, int m, const float *__restrict__ dataset,
+ float *__restrict__ temp, int *__restrict__ idxs) {
+ // dataset: (B, N, 3)
+ // tmp: (B, N)
+ // output:
+ // idx: (B, M)
+
+ if (m <= 0) return;
+ __shared__ float dists[block_size];
+ __shared__ int dists_i[block_size];
+
+ int batch_index = blockIdx.x;
+ dataset += batch_index * n * 3;
+ temp += batch_index * n;
+ idxs += batch_index * m;
+
+ int tid = threadIdx.x;
+ const int stride = block_size;
+
+ int old = 0;
+ if (threadIdx.x == 0) idxs[0] = old;
+
+ __syncthreads();
+ for (int j = 1; j < m; j++) {
+ int besti = 0;
+ float best = -1;
+ float x1 = dataset[old * 3 + 0];
+ float y1 = dataset[old * 3 + 1];
+ float z1 = dataset[old * 3 + 2];
+ for (int k = tid; k < n; k += stride) {
+ float x2, y2, z2;
+ x2 = dataset[k * 3 + 0];
+ y2 = dataset[k * 3 + 1];
+ z2 = dataset[k * 3 + 2];
+ // float mag = (x2 * x2) + (y2 * y2) + (z2 * z2);
+ // if (mag <= 1e-3)
+ // continue;
+
+ float d =
+ (x2 - x1) * (x2 - x1) + (y2 - y1) * (y2 - y1) + (z2 - z1) * (z2 - z1);
+ float d2 = min(d, temp[k]);
+ temp[k] = d2;
+ besti = d2 > best ? k : besti;
+ best = d2 > best ? d2 : best;
+ }
+ dists[tid] = best;
+ dists_i[tid] = besti;
+ __syncthreads();
+
+#pragma unroll
+ for (int block_size_thres = 1024; block_size_thres >= 2;
+ block_size_thres >>= 1) {
+ const int tid_thres = block_size_thres / 2;
+ if (block_size >= block_size_thres && tid < tid_thres) {
+ __update(dists, dists_i, tid, tid + tid_thres);
+ }
+ __syncthreads();
+ }
+
+ old = dists_i[0];
+ if (tid == 0) idxs[j] = old;
+ }
+}
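+
+// The #pragma unroll loop above is a standard shared-memory tree reduction:
+// at each step the lower half of the active threads folds in the upper half
+// via __update (keeping the larger distance and its index), so after
+// log2(block_size) steps dists_i[0] holds the index of the point farthest
+// from the already-selected set, which becomes the next sample. E.g. with
+// block_size = 1024 the steps compare tid with tid+512, tid+256, ..., tid+1.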
+
+// Modified from
+// https://github.com/qiqihaer/3DSSD-pytorch/blob/master/lib/pointnet2/src/sampling_gpu.cu
+template <unsigned int block_size>
+__global__ void furthest_point_sampling_with_dist_forward_cuda_kernel(
+ int b, int n, int m, const float *__restrict__ dataset,
+ float *__restrict__ temp, int *__restrict__ idxs) {
+ // dataset: (B, N, N)
+ // tmp: (B, N)
+ // output:
+ // idx: (B, M)
+
+ if (m <= 0) return;
+ __shared__ float dists[block_size];
+ __shared__ int dists_i[block_size];
+
+ int batch_index = blockIdx.x;
+ dataset += batch_index * n * n;
+ temp += batch_index * n;
+ idxs += batch_index * m;
+
+ int tid = threadIdx.x;
+ const int stride = block_size;
+
+ int old = 0;
+ if (threadIdx.x == 0) idxs[0] = old;
+
+ __syncthreads();
+ for (int j = 1; j < m; j++) {
+ int besti = 0;
+ float best = -1;
+ // float x1 = dataset[old * 3 + 0];
+ // float y1 = dataset[old * 3 + 1];
+ // float z1 = dataset[old * 3 + 2];
+ for (int k = tid; k < n; k += stride) {
+ // float x2, y2, z2;
+ // x2 = dataset[k * 3 + 0];
+ // y2 = dataset[k * 3 + 1];
+ // z2 = dataset[k * 3 + 2];
+
+ // float d = (x2 - x1) * (x2 - x1) + (y2 - y1) * (y2 - y1) + (z2 - z1) *
+ // (z2 - z1);
+ float d = dataset[old * n + k];
+
+ float d2 = min(d, temp[k]);
+ temp[k] = d2;
+ besti = d2 > best ? k : besti;
+ best = d2 > best ? d2 : best;
+ }
+ dists[tid] = best;
+ dists_i[tid] = besti;
+ __syncthreads();
+
+#pragma unroll
+ for (int block_size_thres = 1024; block_size_thres >= 2;
+ block_size_thres >>= 1) {
+ const int tid_thres = block_size_thres / 2;
+ if (block_size >= block_size_thres && tid < tid_thres) {
+ __update(dists, dists_i, tid, tid + tid_thres);
+ }
+ __syncthreads();
+ }
+
+ old = dists_i[0];
+ if (tid == 0) idxs[j] = old;
+ }
+}
+
+#endif // FURTHEST_POINT_SAMPLE_CUDA_KERNEL_CUH
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/csrc/common/cuda/gather_points_cuda_kernel.cuh b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/csrc/common/cuda/gather_points_cuda_kernel.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..cabcd4a6528270a31409a69048e6f4faa9d392ea
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/csrc/common/cuda/gather_points_cuda_kernel.cuh
@@ -0,0 +1,71 @@
+// encoding=utf-8
+// Copyright 2021 Huawei Technologies Co., Ltd
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#ifndef GATHER_POINTS_CUDA_KERNEL_CUH
+#define GATHER_POINTS_CUDA_KERNEL_CUH
+
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else
+#include "pytorch_cuda_helper.hpp"
+#endif
+
+#define TOTAL_THREADS 1024
+
+template <typename T>
+__global__ void gather_points_forward_cuda_kernel(int b, int c, int n, int m,
+ const T *points,
+ const int *__restrict__ idx,
+ T *out) {
+ // points: (B, C, N)
+ // idx: (B, M)
+ // output:
+ // out: (B, C, M)
+
+ int bs_idx = blockIdx.z;
+ int c_idx = blockIdx.y;
+ CUDA_1D_KERNEL_LOOP(pt_idx, m) {
+ if (bs_idx >= b || c_idx >= c) return;
+
+ out += bs_idx * c * m + c_idx * m + pt_idx;
+ idx += bs_idx * m + pt_idx;
+ points += bs_idx * c * n + c_idx * n;
+ out[0] = points[idx[0]];
+ }
+}
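+
+// Indexing example: a thread with (bs_idx = 2, c_idx = 1, pt_idx = 3) copies
+// points[2][1][idx[2][3]] into out[2][1][3]; the backward kernel below routes
+// the gradient back along the same index with atomicAdd, since several output
+// points may gather from the same input point.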
+
+template <typename T>
+__global__ void gather_points_backward_cuda_kernel(int b, int c, int n, int m,
+ const T *grad_out,
+ const int *__restrict__ idx,
+ T *grad_points) {
+ // grad_out: (B, C, M)
+ // idx: (B, M)
+ // output:
+ // grad_points: (B, C, N)
+
+ int bs_idx = blockIdx.z;
+ int c_idx = blockIdx.y;
+ CUDA_1D_KERNEL_LOOP(pt_idx, m) {
+ if (bs_idx >= b || c_idx >= c) return;
+
+ grad_out += bs_idx * c * m + c_idx * m + pt_idx;
+ idx += bs_idx * m + pt_idx;
+ grad_points += bs_idx * c * n + c_idx * n;
+
+ atomicAdd(grad_points + idx[0], grad_out[0]);
+ }
+}
+
+#endif // GATHER_POINTS_CUDA_KERNEL_CUH
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/csrc/common/cuda/group_points_cuda_kernel.cuh b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/csrc/common/cuda/group_points_cuda_kernel.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..8a28149f09a714e6afc3cea98ff74b4472a409c6
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/csrc/common/cuda/group_points_cuda_kernel.cuh
@@ -0,0 +1,78 @@
+// encoding=utf-8
+// Copyright 2021 Huawei Technologies Co., Ltd
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// Modified from
+// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/group_points_gpu.cu
+#ifndef GROUP_POINTS_CUDA_KERNEL_CUH
+#define GROUP_POINTS_CUDA_KERNEL_CUH
+
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else
+#include "pytorch_cuda_helper.hpp"
+#endif
+
+template <typename T>
+__global__ void group_points_forward_cuda_kernel(int b, int c, int n,
+ int npoints, int nsample,
+ const T *points,
+ const int *__restrict__ idx,
+ T *out) {
+ // points: (B, C, N)
+ // idx: (B, npoints, nsample)
+ // output:
+ // out: (B, C, npoints, nsample)
+ int bs_idx = blockIdx.z;
+ int c_idx = blockIdx.y;
+ CUDA_1D_KERNEL_LOOP(index, npoints * nsample) {
+ if (bs_idx >= b || c_idx >= c) return;
+
+ int pt_idx = index / nsample;
+ int sample_idx = index % nsample;
+
+ idx += bs_idx * npoints * nsample + pt_idx * nsample + sample_idx;
+ int in_idx = bs_idx * c * n + c_idx * n + idx[0];
+ int out_idx = bs_idx * c * npoints * nsample + c_idx * npoints * nsample +
+ pt_idx * nsample + sample_idx;
+
+ out[out_idx] = points[in_idx];
+ }
+}
+
+template <typename T>
+__global__ void group_points_backward_cuda_kernel(int b, int c, int n,
+ int npoints, int nsample,
+ const T *grad_out,
+ const int *__restrict__ idx,
+ T *grad_points) {
+ // grad_out: (B, C, npoints, nsample)
+ // idx: (B, npoints, nsample)
+ // output:
+ // grad_points: (B, C, N)
+ int bs_idx = blockIdx.z;
+ int c_idx = blockIdx.y;
+ CUDA_1D_KERNEL_LOOP(index, npoints * nsample) {
+ int pt_idx = index / nsample;
+ if (bs_idx >= b || c_idx >= c) return;
+
+ int sample_idx = index % nsample;
+ grad_out += bs_idx * c * npoints * nsample + c_idx * npoints * nsample +
+ pt_idx * nsample + sample_idx;
+ idx += bs_idx * npoints * nsample + pt_idx * nsample + sample_idx;
+
+ atomicAdd(grad_points + bs_idx * c * n + c_idx * n + idx[0], grad_out[0]);
+ }
+}
+
+#endif // GROUP_POINTS_CUDA_KERNEL_CUH
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/csrc/common/cuda/iou3d_cuda_kernel.cuh b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/csrc/common/cuda/iou3d_cuda_kernel.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..d6a04d575e1f43314656f7046aee3b491ba92e53
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/csrc/common/cuda/iou3d_cuda_kernel.cuh
@@ -0,0 +1,383 @@
+// encoding=utf-8
+// Copyright 2021 Huawei Technologies Co., Ltd
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#ifndef IOU3D_CUDA_KERNEL_CUH
+#define IOU3D_CUDA_KERNEL_CUH
+
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else
+#include "pytorch_cuda_helper.hpp"
+#endif
+
+const int THREADS_PER_BLOCK_IOU3D = 16;
+const int THREADS_PER_BLOCK_NMS = sizeof(unsigned long long) * 8;
+__device__ const float EPS = 1e-8;
+
+struct Point {
+ float x, y;
+ __device__ Point() {}
+ __device__ Point(double _x, double _y) { x = _x, y = _y; }
+
+ __device__ void set(float _x, float _y) {
+ x = _x;
+ y = _y;
+ }
+
+ __device__ Point operator+(const Point &b) const {
+ return Point(x + b.x, y + b.y);
+ }
+
+ __device__ Point operator-(const Point &b) const {
+ return Point(x - b.x, y - b.y);
+ }
+};
+
+__device__ inline float cross(const Point &a, const Point &b) {
+ return a.x * b.y - a.y * b.x;
+}
+
+__device__ inline float cross(const Point &p1, const Point &p2,
+ const Point &p0) {
+ return (p1.x - p0.x) * (p2.y - p0.y) - (p2.x - p0.x) * (p1.y - p0.y);
+}
+
+__device__ int check_rect_cross(const Point &p1, const Point &p2,
+ const Point &q1, const Point &q2) {
+ int ret = min(p1.x, p2.x) <= max(q1.x, q2.x) &&
+ min(q1.x, q2.x) <= max(p1.x, p2.x) &&
+ min(p1.y, p2.y) <= max(q1.y, q2.y) &&
+ min(q1.y, q2.y) <= max(p1.y, p2.y);
+ return ret;
+}
+
+__device__ inline int check_in_box2d(const float *box, const Point &p) {
+ // params: box (5) [x1, y1, x2, y2, angle]
+ const float MARGIN = 1e-5;
+
+ float center_x = (box[0] + box[2]) / 2;
+ float center_y = (box[1] + box[3]) / 2;
+ float angle_cos = cos(-box[4]),
+ angle_sin =
+ sin(-box[4]); // rotate the point in the opposite direction of box
+ float rot_x =
+ (p.x - center_x) * angle_cos - (p.y - center_y) * angle_sin + center_x;
+ float rot_y =
+ (p.x - center_x) * angle_sin + (p.y - center_y) * angle_cos + center_y;
+
+ return (rot_x > box[0] - MARGIN && rot_x < box[2] + MARGIN &&
+ rot_y > box[1] - MARGIN && rot_y < box[3] + MARGIN);
+}
+
+__device__ inline int intersection(const Point &p1, const Point &p0,
+ const Point &q1, const Point &q0,
+ Point &ans_point) {
+ // fast exclusion
+ if (check_rect_cross(p0, p1, q0, q1) == 0) return 0;
+
+ // check cross standing
+ float s1 = cross(q0, p1, p0);
+ float s2 = cross(p1, q1, p0);
+ float s3 = cross(p0, q1, q0);
+ float s4 = cross(q1, p1, q0);
+
+ if (!(s1 * s2 > 0 && s3 * s4 > 0)) return 0;
+
+ // calculate intersection of two lines
+ float s5 = cross(q1, p1, p0);
+ if (fabs(s5 - s1) > EPS) {
+ ans_point.x = (s5 * q0.x - s1 * q1.x) / (s5 - s1);
+ ans_point.y = (s5 * q0.y - s1 * q1.y) / (s5 - s1);
+
+ } else {
+ float a0 = p0.y - p1.y, b0 = p1.x - p0.x, c0 = p0.x * p1.y - p1.x * p0.y;
+ float a1 = q0.y - q1.y, b1 = q1.x - q0.x, c1 = q0.x * q1.y - q1.x * q0.y;
+ float D = a0 * b1 - a1 * b0;
+
+ ans_point.x = (b0 * c1 - b1 * c0) / D;
+ ans_point.y = (a1 * c0 - a0 * c1) / D;
+ }
+
+ return 1;
+}
+
+__device__ inline void rotate_around_center(const Point ¢er,
+ const float angle_cos,
+ const float angle_sin, Point &p) {
+ float new_x =
+ (p.x - center.x) * angle_cos - (p.y - center.y) * angle_sin + center.x;
+ float new_y =
+ (p.x - center.x) * angle_sin + (p.y - center.y) * angle_cos + center.y;
+ p.set(new_x, new_y);
+}
+
+__device__ inline int point_cmp(const Point &a, const Point &b,
+ const Point ¢er) {
+ return atan2(a.y - center.y, a.x - center.x) >
+ atan2(b.y - center.y, b.x - center.x);
+}
+
+__device__ inline float box_overlap(const float *box_a, const float *box_b) {
+ // params: box_a (5) [x1, y1, x2, y2, angle]
+ // params: box_b (5) [x1, y1, x2, y2, angle]
+
+ float a_x1 = box_a[0], a_y1 = box_a[1], a_x2 = box_a[2], a_y2 = box_a[3],
+ a_angle = box_a[4];
+ float b_x1 = box_b[0], b_y1 = box_b[1], b_x2 = box_b[2], b_y2 = box_b[3],
+ b_angle = box_b[4];
+
+ Point center_a((a_x1 + a_x2) / 2, (a_y1 + a_y2) / 2);
+ Point center_b((b_x1 + b_x2) / 2, (b_y1 + b_y2) / 2);
+
+ Point box_a_corners[5];
+ box_a_corners[0].set(a_x1, a_y1);
+ box_a_corners[1].set(a_x2, a_y1);
+ box_a_corners[2].set(a_x2, a_y2);
+ box_a_corners[3].set(a_x1, a_y2);
+
+ Point box_b_corners[5];
+ box_b_corners[0].set(b_x1, b_y1);
+ box_b_corners[1].set(b_x2, b_y1);
+ box_b_corners[2].set(b_x2, b_y2);
+ box_b_corners[3].set(b_x1, b_y2);
+
+ // get oriented corners
+ float a_angle_cos = cos(a_angle), a_angle_sin = sin(a_angle);
+ float b_angle_cos = cos(b_angle), b_angle_sin = sin(b_angle);
+
+ for (int k = 0; k < 4; k++) {
+ rotate_around_center(center_a, a_angle_cos, a_angle_sin, box_a_corners[k]);
+ rotate_around_center(center_b, b_angle_cos, b_angle_sin, box_b_corners[k]);
+ }
+
+ box_a_corners[4] = box_a_corners[0];
+ box_b_corners[4] = box_b_corners[0];
+
+ // get intersection of lines
+ Point cross_points[16];
+ Point poly_center;
+ int cnt = 0, flag = 0;
+
+ poly_center.set(0, 0);
+ for (int i = 0; i < 4; i++) {
+ for (int j = 0; j < 4; j++) {
+ flag = intersection(box_a_corners[i + 1], box_a_corners[i],
+ box_b_corners[j + 1], box_b_corners[j],
+ cross_points[cnt]);
+ if (flag) {
+ poly_center = poly_center + cross_points[cnt];
+ cnt++;
+ }
+ }
+ }
+
+ // check corners
+ for (int k = 0; k < 4; k++) {
+ if (check_in_box2d(box_a, box_b_corners[k])) {
+ poly_center = poly_center + box_b_corners[k];
+ cross_points[cnt] = box_b_corners[k];
+ cnt++;
+ }
+ if (check_in_box2d(box_b, box_a_corners[k])) {
+ poly_center = poly_center + box_a_corners[k];
+ cross_points[cnt] = box_a_corners[k];
+ cnt++;
+ }
+ }
+
+ poly_center.x /= cnt;
+ poly_center.y /= cnt;
+
+ // sort the points of polygon
+ Point temp;
+ for (int j = 0; j < cnt - 1; j++) {
+ for (int i = 0; i < cnt - j - 1; i++) {
+ if (point_cmp(cross_points[i], cross_points[i + 1], poly_center)) {
+ temp = cross_points[i];
+ cross_points[i] = cross_points[i + 1];
+ cross_points[i + 1] = temp;
+ }
+ }
+ }
+
+ // get the overlap areas
+ float area = 0;
+ for (int k = 0; k < cnt - 1; k++) {
+ area += cross(cross_points[k] - cross_points[0],
+ cross_points[k + 1] - cross_points[0]);
+ }
+
+ return fabs(area) / 2.0;
+}
+
+__device__ inline float iou_bev(const float *box_a, const float *box_b) {
+ // params: box_a (5) [x1, y1, x2, y2, angle]
+ // params: box_b (5) [x1, y1, x2, y2, angle]
+ float sa = (box_a[2] - box_a[0]) * (box_a[3] - box_a[1]);
+ float sb = (box_b[2] - box_b[0]) * (box_b[3] - box_b[1]);
+ float s_overlap = box_overlap(box_a, box_b);
+ return s_overlap / fmaxf(sa + sb - s_overlap, EPS);
+}
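+
+// box_overlap / iou_bev compute rotated-box IoU in bird's-eye view: rotate the
+// corners of both boxes, collect every edge-edge intersection plus every
+// corner lying inside the other box, sort those vertices by angle around
+// their centroid (point_cmp), and sum triangle areas with the cross product
+// (shoelace formula). iou_bev divides the overlap by the union area, with EPS
+// guarding against division by zero for degenerate boxes.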
+
+__global__ void iou3d_boxes_overlap_bev_forward_cuda_kernel(
+ const int num_a, const float *boxes_a, const int num_b,
+ const float *boxes_b, float *ans_overlap) {
+ CUDA_2D_KERNEL_LOOP(b_idx, num_b, a_idx, num_a) {
+ if (a_idx >= num_a || b_idx >= num_b) {
+ return;
+ }
+ const float *cur_box_a = boxes_a + a_idx * 5;
+ const float *cur_box_b = boxes_b + b_idx * 5;
+ float s_overlap = box_overlap(cur_box_a, cur_box_b);
+ ans_overlap[a_idx * num_b + b_idx] = s_overlap;
+ }
+}
+
+__global__ void iou3d_boxes_iou_bev_forward_cuda_kernel(const int num_a,
+ const float *boxes_a,
+ const int num_b,
+ const float *boxes_b,
+ float *ans_iou) {
+ CUDA_2D_KERNEL_LOOP(b_idx, num_b, a_idx, num_a) {
+ if (a_idx >= num_a || b_idx >= num_b) {
+ return;
+ }
+
+ const float *cur_box_a = boxes_a + a_idx * 5;
+ const float *cur_box_b = boxes_b + b_idx * 5;
+ float cur_iou_bev = iou_bev(cur_box_a, cur_box_b);
+ ans_iou[a_idx * num_b + b_idx] = cur_iou_bev;
+ }
+}
+
+__global__ void nms_forward_cuda_kernel(const int boxes_num,
+ const float nms_overlap_thresh,
+ const float *boxes,
+ unsigned long long *mask) {
+ // params: boxes (N, 5) [x1, y1, x2, y2, ry]
+ // params: mask (N, N/THREADS_PER_BLOCK_NMS)
+ const int blocks =
+ (boxes_num + THREADS_PER_BLOCK_NMS - 1) / THREADS_PER_BLOCK_NMS;
+ CUDA_2D_KERNEL_BLOCK_LOOP(col_start, blocks, row_start, blocks) {
+ // if (row_start > col_start) return;
+
+ const int row_size = fminf(boxes_num - row_start * THREADS_PER_BLOCK_NMS,
+ THREADS_PER_BLOCK_NMS);
+ const int col_size = fminf(boxes_num - col_start * THREADS_PER_BLOCK_NMS,
+ THREADS_PER_BLOCK_NMS);
+
+ __shared__ float block_boxes[THREADS_PER_BLOCK_NMS * 5];
+
+ if (threadIdx.x < col_size) {
+ block_boxes[threadIdx.x * 5 + 0] =
+ boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 0];
+ block_boxes[threadIdx.x * 5 + 1] =
+ boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 1];
+ block_boxes[threadIdx.x * 5 + 2] =
+ boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 2];
+ block_boxes[threadIdx.x * 5 + 3] =
+ boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 3];
+ block_boxes[threadIdx.x * 5 + 4] =
+ boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 4];
+ }
+ __syncthreads();
+
+ if (threadIdx.x < row_size) {
+ const int cur_box_idx = THREADS_PER_BLOCK_NMS * row_start + threadIdx.x;
+ const float *cur_box = boxes + cur_box_idx * 5;
+
+ int i = 0;
+ unsigned long long t = 0;
+ int start = 0;
+ if (row_start == col_start) {
+ start = threadIdx.x + 1;
+ }
+ for (i = start; i < col_size; i++) {
+ if (iou_bev(cur_box, block_boxes + i * 5) > nms_overlap_thresh) {
+ t |= 1ULL << i;
+ }
+ }
+ const int col_blocks =
+ (boxes_num + THREADS_PER_BLOCK_NMS - 1) / THREADS_PER_BLOCK_NMS;
+ mask[cur_box_idx * col_blocks + col_start] = t;
+ }
+ }
+}
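+
+// Each row of `mask` is a bitset: bit i of mask[box][col] is set when `box`
+// overlaps the i-th box of column block `col` above nms_overlap_thresh.
+// A typical host-side consumption is a greedy sequential pass, sketched here
+// for illustration only (the actual logic lives in the op's host code):
+//   std::vector<int> keep;
+//   std::vector<unsigned long long> remv(col_blocks, 0);
+//   for (int i = 0; i < boxes_num; i++) {
+//     int block = i / THREADS_PER_BLOCK_NMS, bit = i % THREADS_PER_BLOCK_NMS;
+//     if (!(remv[block] & (1ULL << bit))) {  // box i is not suppressed yet
+//       keep.push_back(i);
+//       for (int j = block; j < col_blocks; j++)
+//         remv[j] |= mask[i * col_blocks + j];
+//     }
+//   }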
+
+__device__ inline float iou_normal(float const *const a, float const *const b) {
+ float left = fmaxf(a[0], b[0]), right = fminf(a[2], b[2]);
+ float top = fmaxf(a[1], b[1]), bottom = fminf(a[3], b[3]);
+ float width = fmaxf(right - left, 0.f), height = fmaxf(bottom - top, 0.f);
+ float interS = width * height;
+ float Sa = (a[2] - a[0]) * (a[3] - a[1]);
+ float Sb = (b[2] - b[0]) * (b[3] - b[1]);
+ return interS / fmaxf(Sa + Sb - interS, EPS);
+}
+
+__global__ void nms_normal_forward_cuda_kernel(const int boxes_num,
+ const float nms_overlap_thresh,
+ const float *boxes,
+ unsigned long long *mask) {
+ // params: boxes (N, 5) [x1, y1, x2, y2, ry]
+ // params: mask (N, N/THREADS_PER_BLOCK_NMS)
+
+ const int blocks =
+ (boxes_num + THREADS_PER_BLOCK_NMS - 1) / THREADS_PER_BLOCK_NMS;
+ CUDA_2D_KERNEL_BLOCK_LOOP(col_start, blocks, row_start, blocks) {
+ // if (row_start > col_start) return;
+
+ const int row_size = fminf(boxes_num - row_start * THREADS_PER_BLOCK_NMS,
+ THREADS_PER_BLOCK_NMS);
+ const int col_size = fminf(boxes_num - col_start * THREADS_PER_BLOCK_NMS,
+ THREADS_PER_BLOCK_NMS);
+
+ __shared__ float block_boxes[THREADS_PER_BLOCK_NMS * 5];
+
+ if (threadIdx.x < col_size) {
+ block_boxes[threadIdx.x * 5 + 0] =
+ boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 0];
+ block_boxes[threadIdx.x * 5 + 1] =
+ boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 1];
+ block_boxes[threadIdx.x * 5 + 2] =
+ boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 2];
+ block_boxes[threadIdx.x * 5 + 3] =
+ boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 3];
+ block_boxes[threadIdx.x * 5 + 4] =
+ boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 4];
+ }
+ __syncthreads();
+
+ if (threadIdx.x < row_size) {
+ const int cur_box_idx = THREADS_PER_BLOCK_NMS * row_start + threadIdx.x;
+ const float *cur_box = boxes + cur_box_idx * 5;
+
+ int i = 0;
+ unsigned long long t = 0;
+ int start = 0;
+ if (row_start == col_start) {
+ start = threadIdx.x + 1;
+ }
+ for (i = start; i < col_size; i++) {
+ if (iou_normal(cur_box, block_boxes + i * 5) > nms_overlap_thresh) {
+ t |= 1ULL << i;
+ }
+ }
+ const int col_blocks =
+ (boxes_num + THREADS_PER_BLOCK_NMS - 1) / THREADS_PER_BLOCK_NMS;
+ mask[cur_box_idx * col_blocks + col_start] = t;
+ }
+ }
+}
+
+#endif // IOU3D_CUDA_KERNEL_CUH
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/csrc/common/cuda/knn_cuda_kernel.cuh b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/csrc/common/cuda/knn_cuda_kernel.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..30889e728093195955a52b078e1ff433f8b0c4a5
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/csrc/common/cuda/knn_cuda_kernel.cuh
@@ -0,0 +1,105 @@
+// encoding=utf-8
+// Copyright 2021 Huawei Technologies Co., Ltd
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// Modified from
+// https://github.com/CVMI-Lab/PAConv/tree/main/scene_seg/lib/pointops/src/knnquery_heap
+#ifndef KNN_CUDA_KERNEL_CUH
+#define KNN_CUDA_KERNEL_CUH
+
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else
+#include "pytorch_cuda_helper.hpp"
+#endif
+
+inline __device__ void swap_float(float *x, float *y) {
+ float tmp = *x;
+ *x = *y;
+ *y = tmp;
+}
+
+inline __device__ void swap_int(int *x, int *y) {
+ int tmp = *x;
+ *x = *y;
+ *y = tmp;
+}
+
+__device__ void reheap(float *dist, int *idx, int k) {
+ int root = 0;
+ int child = root * 2 + 1;
+ while (child < k) {
+ if (child + 1 < k && dist[child + 1] > dist[child]) child++;
+ if (dist[root] > dist[child]) return;
+ swap_float(&dist[root], &dist[child]);
+ swap_int(&idx[root], &idx[child]);
+ root = child;
+ child = root * 2 + 1;
+ }
+}
+
+__device__ void heap_sort(float *dist, int *idx, int k) {
+ int i;
+ for (i = k - 1; i > 0; i--) {
+ swap_float(&dist[0], &dist[i]);
+ swap_int(&idx[0], &idx[i]);
+ reheap(dist, idx, i);
+ }
+}
+
+// input: xyz (b, n, 3) new_xyz (b, m, 3)
+// output: idx (b, m, nsample) dist2 (b, m, nsample)
+template <typename T>
+__global__ void knn_forward_cuda_kernel(int b, int n, int m, int nsample,
+ const T *xyz, const T *new_xyz,
+ int *__restrict__ idx, T *dist2) {
+ int bs_idx = blockIdx.y;
+ CUDA_1D_KERNEL_LOOP(pt_idx, m) {
+ if (bs_idx >= b) return;
+
+ new_xyz += bs_idx * m * 3 + pt_idx * 3;
+ xyz += bs_idx * n * 3;
+ idx += bs_idx * m * nsample + pt_idx * nsample;
+ dist2 += bs_idx * m * nsample + pt_idx * nsample;
+
+ T new_x = new_xyz[0];
+ T new_y = new_xyz[1];
+ T new_z = new_xyz[2];
+
+ float best_dist[100];
+ int best_idx[100];
+ for (int i = 0; i < nsample; i++) {
+ best_dist[i] = 1e10;
+ best_idx[i] = 0;
+ }
+ for (int i = 0; i < n; i++) {
+ T x = xyz[i * 3 + 0];
+ T y = xyz[i * 3 + 1];
+ T z = xyz[i * 3 + 2];
+ T d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) +
+ (new_z - z) * (new_z - z);
+ if (d2 < best_dist[0]) {
+ best_dist[0] = d2;
+ best_idx[0] = i;
+ reheap(best_dist, best_idx, nsample);
+ }
+ }
+ heap_sort(best_dist, best_idx, nsample);
+ for (int i = 0; i < nsample; i++) {
+ idx[i] = best_idx[i];
+ dist2[i] = best_dist[i];
+ }
+ }
+}
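+
+// best_dist / best_idx form a max-heap over the current nsample nearest
+// candidates: best_dist[0] is the worst of them, so a new point is accepted
+// only if it beats that entry, after which reheap restores the heap property.
+// heap_sort then emits the neighbors in ascending distance order. Note the
+// fixed local arrays of size 100 cap nsample at 100 for this kernel.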
+
+#endif // KNN_CUDA_KERNEL_CUH
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/csrc/common/cuda/masked_conv2d_cuda_kernel.cuh b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/csrc/common/cuda/masked_conv2d_cuda_kernel.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..7331e59aa687e12d0c326458fbd452bbf308436b
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/csrc/common/cuda/masked_conv2d_cuda_kernel.cuh
@@ -0,0 +1,75 @@
+// encoding=utf-8
+// Copyright 2021 Huawei Technologies Co., Ltd
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#ifndef MASKED_CONV2D_CUDA_KERNEL_CUH
+#define MASKED_CONV2D_CUDA_KERNEL_CUH
+
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else
+#include "pytorch_cuda_helper.hpp"
+#endif
+
+template <typename scalar_t>
+__global__ void MaskedIm2colForward(const int n, const scalar_t *data_im,
+ const int height, const int width,
+ const int kernel_h, const int kernel_w,
+ const int pad_h, const int pad_w,
+ const int64_t *mask_h_idx,
+ const int64_t *mask_w_idx,
+ const int mask_cnt, scalar_t *data_col) {
+ // mask_cnt * channels
+ CUDA_1D_KERNEL_LOOP(index, n) {
+ const int m_index = index % mask_cnt;
+ const int h_col = mask_h_idx[m_index];
+ const int w_col = mask_w_idx[m_index];
+ const int c_im = index / mask_cnt;
+ const int c_col = c_im * kernel_h * kernel_w;
+ const int h_offset = h_col - pad_h;
+ const int w_offset = w_col - pad_w;
+ scalar_t *data_col_ptr = data_col + c_col * mask_cnt + m_index;
+ for (int i = 0; i < kernel_h; ++i) {
+ int h_im = h_offset + i;
+ for (int j = 0; j < kernel_w; ++j) {
+ int w_im = w_offset + j;
+ if (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) {
+ *data_col_ptr =
+ (scalar_t)data_im[(c_im * height + h_im) * width + w_im];
+ } else {
+ *data_col_ptr = 0.0;
+ }
+ data_col_ptr += mask_cnt;
+ }
+ }
+ }
+}
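+
+// Layout example: data_col is a (channels * kernel_h * kernel_w) x mask_cnt
+// matrix with one column per masked output location. A thread handling
+// channel c_im and mask entry m_index fills the kernel_h * kernel_w rows that
+// belong to c_im in column m_index, writing 0 for taps falling outside the
+// image. MaskedCol2imForward below scatters each entry of a
+// (channels x mask_cnt) buffer back to the (h, w) position recorded in
+// mask_h_idx / mask_w_idx.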
+
+template <typename scalar_t>
+__global__ void MaskedCol2imForward(const int n, const scalar_t *data_col,
+ const int height, const int width,
+ const int channels,
+ const int64_t *mask_h_idx,
+ const int64_t *mask_w_idx,
+ const int mask_cnt, scalar_t *data_im) {
+ CUDA_1D_KERNEL_LOOP(index, n) {
+ const int m_index = index % mask_cnt;
+ const int h_im = mask_h_idx[m_index];
+ const int w_im = mask_w_idx[m_index];
+ const int c_im = index / mask_cnt;
+ // compute the start and end of the output
+ data_im[(c_im * height + h_im) * width + w_im] = data_col[index];
+ }
+}
+
+#endif // MASKED_CONV2D_CUDA_KERNEL_CUH
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/csrc/common/cuda/min_area_polygons_cuda.cuh b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/csrc/common/cuda/min_area_polygons_cuda.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..633b7b91b28beaeb547dc293a6d3e386e8a9e44c
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/csrc/common/cuda/min_area_polygons_cuda.cuh
@@ -0,0 +1,313 @@
+// encoding=utf-8
+// Copyright 2021 Huawei Technologies Co., Ltd
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#ifndef MIN_AREA_POLYGONS_CUDA_KERNEL_CUH
+#define MIN_AREA_POLYGONS_CUDA_KERNEL_CUH
+
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else
+#include "pytorch_cuda_helper.hpp"
+#endif
+
+#define MAXN 20
+__device__ const float PI = 3.1415926;
+
+struct Point {
+ float x, y;
+ __device__ Point() {}
+ __device__ Point(float x, float y) : x(x), y(y) {}
+};
+
+__device__ inline void swap1(Point *a, Point *b) {
+ Point temp;
+ temp.x = a->x;
+ temp.y = a->y;
+
+ a->x = b->x;
+ a->y = b->y;
+
+ b->x = temp.x;
+ b->y = temp.y;
+}
+__device__ inline float cross(Point o, Point a, Point b) {
+ return (a.x - o.x) * (b.y - o.y) - (b.x - o.x) * (a.y - o.y);
+}
+
+__device__ inline float dis(Point a, Point b) {
+ return (a.x - b.x) * (a.x - b.x) + (a.y - b.y) * (a.y - b.y);
+}
+__device__ inline void minBoundingRect(Point *ps, int n_points, float *minbox) {
+ float convex_points[2][MAXN];
+ for (int j = 0; j < n_points; j++) {
+ convex_points[0][j] = ps[j].x;
+ }
+ for (int j = 0; j < n_points; j++) {
+ convex_points[1][j] = ps[j].y;
+ }
+
+ Point edges[MAXN];
+ float edges_angles[MAXN];
+ float unique_angles[MAXN];
+ int n_edges = n_points - 1;
+ int n_unique = 0;
+ int unique_flag = 0;
+
+ for (int i = 0; i < n_edges; i++) {
+ edges[i].x = ps[i + 1].x - ps[i].x;
+ edges[i].y = ps[i + 1].y - ps[i].y;
+ }
+ for (int i = 0; i < n_edges; i++) {
+ edges_angles[i] = atan2((double)edges[i].y, (double)edges[i].x);
+ if (edges_angles[i] >= 0) {
+ edges_angles[i] = fmod((double)edges_angles[i], (double)PI / 2);
+ } else {
+ edges_angles[i] =
+ edges_angles[i] - (int)(edges_angles[i] / (PI / 2) - 1) * (PI / 2);
+ }
+ }
+ unique_angles[0] = edges_angles[0];
+ n_unique += 1;
+ for (int i = 1; i < n_edges; i++) {
+ for (int j = 0; j < n_unique; j++) {
+ if (edges_angles[i] == unique_angles[j]) {
+ unique_flag += 1;
+ }
+ }
+ if (unique_flag == 0) {
+ unique_angles[n_unique] = edges_angles[i];
+ n_unique += 1;
+ unique_flag = 0;
+ } else {
+ unique_flag = 0;
+ }
+ }
+
+ float minarea = 1e12;
+ for (int i = 0; i < n_unique; i++) {
+ float R[2][2];
+ float rot_points[2][MAXN];
+ R[0][0] = cos(unique_angles[i]);
+ R[0][1] = -sin(unique_angles[i]);
+ R[1][0] = sin(unique_angles[i]);
+ R[1][1] = cos(unique_angles[i]);
+ // R x Points
+ for (int m = 0; m < 2; m++) {
+ for (int n = 0; n < n_points; n++) {
+ float sum = 0.0;
+ for (int k = 0; k < 2; k++) {
+ sum = sum + R[m][k] * convex_points[k][n];
+ }
+ rot_points[m][n] = sum;
+ }
+ }
+
+ // xmin;
+ float xmin, ymin, xmax, ymax;
+ xmin = 1e12;
+ for (int j = 0; j < n_points; j++) {
+ if (isinf(rot_points[0][j]) || isnan(rot_points[0][j])) {
+ continue;
+ } else {
+ if (rot_points[0][j] < xmin) {
+ xmin = rot_points[0][j];
+ }
+ }
+ }
+ // ymin
+ ymin = 1e12;
+ for (int j = 0; j < n_points; j++) {
+ if (isinf(rot_points[1][j]) || isnan(rot_points[1][j])) {
+ continue;
+ } else {
+ if (rot_points[1][j] < ymin) {
+ ymin = rot_points[1][j];
+ }
+ }
+ }
+ // xmax
+ xmax = -1e12;
+ for (int j = 0; j < n_points; j++) {
+ if (isinf(rot_points[0][j]) || isnan(rot_points[0][j])) {
+ continue;
+ } else {
+ if (rot_points[0][j] > xmax) {
+ xmax = rot_points[0][j];
+ }
+ }
+ }
+ // ymax
+ ymax = -1e12;
+ for (int j = 0; j < n_points; j++) {
+ if (isinf(rot_points[1][j]) || isnan(rot_points[1][j])) {
+ continue;
+ } else {
+ if (rot_points[1][j] > ymax) {
+ ymax = rot_points[1][j];
+ }
+ }
+ }
+ float area = (xmax - xmin) * (ymax - ymin);
+ if (area < minarea) {
+ minarea = area;
+ minbox[0] = unique_angles[i];
+ minbox[1] = xmin;
+ minbox[2] = ymin;
+ minbox[3] = xmax;
+ minbox[4] = ymax;
+ }
+ }
+}
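+
+// minBoundingRect follows the classic rotating-directions idea: for every
+// unique edge angle of the convex hull (folded into [0, pi/2)), rotate all
+// hull points by that angle, take the axis-aligned bounding box of the
+// rotated points, and keep the rotation with the smallest area. minbox holds
+// {angle, xmin, ymin, xmax, ymax} in the rotated frame; Findminbox below maps
+// the four corners back with the transposed rotation matrix.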
+
+// convex_find
+__device__ inline void Jarvis(Point *in_poly, int &n_poly) {
+ int n_input = n_poly;
+ Point input_poly[20];
+ for (int i = 0; i < n_input; i++) {
+ input_poly[i].x = in_poly[i].x;
+ input_poly[i].y = in_poly[i].y;
+ }
+ Point p_max, p_k;
+ int max_index, k_index;
+ int Stack[20], top1, top2;
+ // float sign;
+ double sign;
+ Point right_point[10], left_point[10];
+
+ for (int i = 0; i < n_poly; i++) {
+ if (in_poly[i].y < in_poly[0].y ||
+ in_poly[i].y == in_poly[0].y && in_poly[i].x < in_poly[0].x) {
+ Point *j = &(in_poly[0]);
+ Point *k = &(in_poly[i]);
+ swap1(j, k);
+ }
+ if (i == 0) {
+ p_max = in_poly[0];
+ max_index = 0;
+ }
+ if (in_poly[i].y > p_max.y ||
+ in_poly[i].y == p_max.y && in_poly[i].x > p_max.x) {
+ p_max = in_poly[i];
+ max_index = i;
+ }
+ }
+ if (max_index == 0) {
+ max_index = 1;
+ p_max = in_poly[max_index];
+ }
+
+ k_index = 0, Stack[0] = 0, top1 = 0;
+ while (k_index != max_index) {
+ p_k = p_max;
+ k_index = max_index;
+ for (int i = 1; i < n_poly; i++) {
+ sign = cross(in_poly[Stack[top1]], in_poly[i], p_k);
+ if ((sign > 0) || ((sign == 0) && (dis(in_poly[Stack[top1]], in_poly[i]) >
+ dis(in_poly[Stack[top1]], p_k)))) {
+ p_k = in_poly[i];
+ k_index = i;
+ }
+ }
+ top1++;
+ Stack[top1] = k_index;
+ }
+
+ for (int i = 0; i <= top1; i++) {
+ right_point[i] = in_poly[Stack[i]];
+ }
+
+ k_index = 0, Stack[0] = 0, top2 = 0;
+
+ while (k_index != max_index) {
+ p_k = p_max;
+ k_index = max_index;
+ for (int i = 1; i < n_poly; i++) {
+ sign = cross(in_poly[Stack[top2]], in_poly[i], p_k);
+ if ((sign < 0) || (sign == 0) && (dis(in_poly[Stack[top2]], in_poly[i]) >
+ dis(in_poly[Stack[top2]], p_k))) {
+ p_k = in_poly[i];
+ k_index = i;
+ }
+ }
+ top2++;
+ Stack[top2] = k_index;
+ }
+
+ for (int i = top2 - 1; i >= 0; i--) {
+ left_point[i] = in_poly[Stack[i]];
+ }
+
+ for (int i = 0; i < top1 + top2; i++) {
+ if (i <= top1) {
+ in_poly[i] = right_point[i];
+ } else {
+ in_poly[i] = left_point[top2 - (i - top1)];
+ }
+ }
+ n_poly = top1 + top2;
+}
+
+template <typename T>
+__device__ inline void Findminbox(T const *const p, T *minpoints) {
+ Point ps1[MAXN];
+ Point convex[MAXN];
+ for (int i = 0; i < 9; i++) {
+ convex[i].x = p[i * 2];
+ convex[i].y = p[i * 2 + 1];
+ }
+ int n_convex = 9;
+ Jarvis(convex, n_convex);
+ int n1 = n_convex;
+ for (int i = 0; i < n1; i++) {
+ ps1[i].x = convex[i].x;
+ ps1[i].y = convex[i].y;
+ }
+ ps1[n1].x = convex[0].x;
+ ps1[n1].y = convex[0].y;
+
+ float minbbox[5] = {0};
+ minBoundingRect(ps1, n1 + 1, minbbox);
+ float angle = minbbox[0];
+ float xmin = minbbox[1];
+ float ymin = minbbox[2];
+ float xmax = minbbox[3];
+ float ymax = minbbox[4];
+ float R[2][2];
+
+ R[0][0] = cos(angle);
+ R[0][1] = -sin(angle);
+ R[1][0] = sin(angle);
+ R[1][1] = cos(angle);
+
+ minpoints[0] = xmax * R[0][0] + ymin * R[1][0];
+ minpoints[1] = xmax * R[0][1] + ymin * R[1][1];
+ minpoints[2] = xmin * R[0][0] + ymin * R[1][0];
+ minpoints[3] = xmin * R[0][1] + ymin * R[1][1];
+ minpoints[4] = xmin * R[0][0] + ymax * R[1][0];
+ minpoints[5] = xmin * R[0][1] + ymax * R[1][1];
+ minpoints[6] = xmax * R[0][0] + ymax * R[1][0];
+ minpoints[7] = xmax * R[0][1] + ymax * R[1][1];
+}
+
+template <typename T>
+__global__ void min_area_polygons_cuda_kernel(const int ex_n_boxes,
+ const T *ex_boxes, T *minbox) {
+ CUDA_1D_KERNEL_LOOP(index, ex_n_boxes) {
+ const T *cur_box = ex_boxes + index * 18;
+ T *cur_min_box = minbox + index * 8;
+ Findminbox(cur_box, cur_min_box);
+ }
+}
+
+#endif // MIN_AREA_POLYGONS_CUDA_KERNEL_CUH
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/csrc/common/cuda/modulated_deform_conv_cuda_kernel.cuh b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/csrc/common/cuda/modulated_deform_conv_cuda_kernel.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..ca0e91a25246569bb7de04649ab4f5afe233670c
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/csrc/common/cuda/modulated_deform_conv_cuda_kernel.cuh
@@ -0,0 +1,399 @@
+/*!
+ ******************* BEGIN Caffe Copyright Notice and Disclaimer
+ *****************
+ *
+ * COPYRIGHT
+ *
+ * All contributions by the University of California:
+ * Copyright (c) 2014-2017 The Regents of the University of California (Regents)
+ * All rights reserved.
+ *
+ * All other contributions:
+ * Copyright (c) 2014-2017, the respective contributors
+ * All rights reserved.
+ *
+ * Caffe uses a shared copyright model: each contributor holds copyright over
+ * their contributions to Caffe. The project versioning records all such
+ * contribution and copyright details. If a contributor wants to further mark
+ * their specific copyright on a particular contribution, they should indicate
+ * their copyright solely in the commit message of the change when it is
+ * committed.
+ *
+ * LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+ *FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ *DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ *SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ *OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * CONTRIBUTION AGREEMENT
+ *
+ * By contributing to the BVLC/caffe repository through pull-request, comment,
+ * or otherwise, the contributor releases their content to the
+ * license and copyright terms herein.
+ *
+ ***************** END Caffe Copyright Notice and Disclaimer
+ *********************
+ *
+ * Copyright (c) 2018 Microsoft
+ * Licensed under The MIT License [see LICENSE for details]
+ * \file modulated_deformable_im2col.cuh
+ * \brief Function definitions of converting an image to
+ * column matrix based on kernel, padding, dilation, and offset.
+ * These functions are mainly used in deformable convolution operators.
+ * \ref: https://arxiv.org/abs/1703.06211
+ * \author Yuwen Xiong, Haozhi Qi, Jifeng Dai, Xizhou Zhu, Han Hu, Dazhi Cheng
+ */
+
+// modified from
+// https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/deform_conv_cuda_kernel.cu
+
+#ifndef MODULATED_DEFORM_CONV_CUDA_KERNEL_CUH
+#define MODULATED_DEFORM_CONV_CUDA_KERNEL_CUH
+
+#include <float.h>
+#ifdef MMCV_WITH_TRT
+#include "common_cuda_helper.hpp"
+#else // MMCV_WITH_TRT
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else // MMCV_USE_PARROTS
+#include "pytorch_cuda_helper.hpp"
+#endif // MMCV_USE_PARROTS
+#endif // MMCV_WITH_TRT
+
+template <typename T>
+__device__ T dmcn_im2col_bilinear(const T *input, const int data_width,
+ const int height, const int width, T h, T w) {
+ int h_low = floorf(h);
+ int w_low = floorf(w);
+ int h_high = h_low + 1;
+ int w_high = w_low + 1;
+
+ T lh = h - h_low;
+ T lw = w - w_low;
+ T hh = 1 - lh, hw = 1 - lw;
+
+ T v1 = 0;
+ if (h_low >= 0 && w_low >= 0) v1 = input[h_low * data_width + w_low];
+ T v2 = 0;
+ if (h_low >= 0 && w_high <= width - 1)
+ v2 = input[h_low * data_width + w_high];
+ T v3 = 0;
+ if (h_high <= height - 1 && w_low >= 0)
+ v3 = input[h_high * data_width + w_low];
+ T v4 = 0;
+ if (h_high <= height - 1 && w_high <= width - 1)
+ v4 = input[h_high * data_width + w_high];
+
+ T w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
+
+ T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
+ return val;
+}
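+
+// Bilinear weight example: for h = 2.3, w = 4.7 we get lh = 0.3, lw = 0.7,
+// hh = 0.7, hw = 0.3, so (w1, w2, w3, w4) = (0.21, 0.49, 0.09, 0.21) for the
+// neighbors (2,4), (2,5), (3,4), (3,5); the weights sum to 1 and neighbors
+// outside the image contribute v = 0 through the boundary checks above.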
+
+template <typename T>
+__device__ T dmcn_get_gradient_weight(T argmax_h, T argmax_w, const int h,
+ const int w, const int height,
+ const int width) {
+ if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 ||
+ argmax_w >= width) {
+ // empty
+ return 0;
+ }
+
+ int argmax_h_low = floorf(argmax_h);
+ int argmax_w_low = floorf(argmax_w);
+ int argmax_h_high = argmax_h_low + 1;
+ int argmax_w_high = argmax_w_low + 1;
+
+ T weight = 0;
+ if (h == argmax_h_low && w == argmax_w_low)
+ weight = (h + 1 - argmax_h) * (w + 1 - argmax_w);
+ if (h == argmax_h_low && w == argmax_w_high)
+ weight = (h + 1 - argmax_h) * (argmax_w + 1 - w);
+ if (h == argmax_h_high && w == argmax_w_low)
+ weight = (argmax_h + 1 - h) * (w + 1 - argmax_w);
+ if (h == argmax_h_high && w == argmax_w_high)
+ weight = (argmax_h + 1 - h) * (argmax_w + 1 - w);
+ return weight;
+}
+
+template <typename T>
+__device__ T dmcn_get_coordinate_weight(T argmax_h, T argmax_w,
+ const int height, const int width,
+ const T *im_data, const int data_width,
+ const int bp_dir) {
+ if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 ||
+ argmax_w >= width) {
+ // empty
+ return 0;
+ }
+
+ int argmax_h_low = floorf(argmax_h);
+ int argmax_w_low = floorf(argmax_w);
+ int argmax_h_high = argmax_h_low + 1;
+ int argmax_w_high = argmax_w_low + 1;
+
+ T weight = 0;
+
+ if (bp_dir == 0) {
+ if (argmax_h_low >= 0 && argmax_w_low >= 0)
+ weight += -1 * (argmax_w_low + 1 - argmax_w) *
+ im_data[argmax_h_low * data_width + argmax_w_low];
+ if (argmax_h_low >= 0 && argmax_w_high <= width - 1)
+ weight += -1 * (argmax_w - argmax_w_low) *
+ im_data[argmax_h_low * data_width + argmax_w_high];
+ if (argmax_h_high <= height - 1 && argmax_w_low >= 0)
+ weight += (argmax_w_low + 1 - argmax_w) *
+ im_data[argmax_h_high * data_width + argmax_w_low];
+ if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
+ weight += (argmax_w - argmax_w_low) *
+ im_data[argmax_h_high * data_width + argmax_w_high];
+ } else if (bp_dir == 1) {
+ if (argmax_h_low >= 0 && argmax_w_low >= 0)
+ weight += -1 * (argmax_h_low + 1 - argmax_h) *
+ im_data[argmax_h_low * data_width + argmax_w_low];
+ if (argmax_h_low >= 0 && argmax_w_high <= width - 1)
+ weight += (argmax_h_low + 1 - argmax_h) *
+ im_data[argmax_h_low * data_width + argmax_w_high];
+ if (argmax_h_high <= height - 1 && argmax_w_low >= 0)
+ weight += -1 * (argmax_h - argmax_h_low) *
+ im_data[argmax_h_high * data_width + argmax_w_low];
+ if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
+ weight += (argmax_h - argmax_h_low) *
+ im_data[argmax_h_high * data_width + argmax_w_high];
+ }
+
+ return weight;
+}
+
+template <typename T>
+__global__ void modulated_deformable_im2col_gpu_kernel(
+ const int n, const T *data_im, const T *data_offset, const T *data_mask,
+ const int height, const int width, const int kernel_h, const int kernel_w,
+ const int pad_h, const int pad_w, const int stride_h, const int stride_w,
+ const int dilation_h, const int dilation_w,
+ const int channel_per_deformable_group, const int batch_size,
+ const int num_channels, const int deformable_group, const int height_col,
+ const int width_col, T *data_col) {
+ CUDA_1D_KERNEL_LOOP(index, n) {
+ // index index of output matrix
+ const int w_col = index % width_col;
+ const int h_col = (index / width_col) % height_col;
+ const int b_col = (index / width_col / height_col) % batch_size;
+ const int c_im = (index / width_col / height_col) / batch_size;
+ const int c_col = c_im * kernel_h * kernel_w;
+
+ // compute deformable group index
+ const int deformable_group_index = c_im / channel_per_deformable_group;
+
+ const int h_in = h_col * stride_h - pad_h;
+ const int w_in = w_col * stride_w - pad_w;
+
+ T *data_col_ptr =
+ data_col +
+ ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col;
+ const T *data_im_ptr =
+ data_im + (b_col * num_channels + c_im) * height * width;
+ const T *data_offset_ptr =
+ data_offset + (b_col * deformable_group + deformable_group_index) * 2 *
+ kernel_h * kernel_w * height_col * width_col;
+
+ const T *data_mask_ptr =
+ data_mask + (b_col * deformable_group + deformable_group_index) *
+ kernel_h * kernel_w * height_col * width_col;
+
+ for (int i = 0; i < kernel_h; ++i) {
+ for (int j = 0; j < kernel_w; ++j) {
+ const int data_offset_h_ptr =
+ ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col;
+ const int data_offset_w_ptr =
+ ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col +
+ w_col;
+ const int data_mask_hw_ptr =
+ ((i * kernel_w + j) * height_col + h_col) * width_col + w_col;
+ const T offset_h = data_offset_ptr[data_offset_h_ptr];
+ const T offset_w = data_offset_ptr[data_offset_w_ptr];
+ const T mask = data_mask_ptr[data_mask_hw_ptr];
+        T val = static_cast<T>(0);
+ const T h_im = h_in + i * dilation_h + offset_h;
+ const T w_im = w_in + j * dilation_w + offset_w;
+ if (h_im > -1 && w_im > -1 && h_im < height && w_im < width)
+ val = dmcn_im2col_bilinear(data_im_ptr, width, height, width, h_im,
+ w_im);
+ *data_col_ptr = val * mask;
+ data_col_ptr += batch_size * height_col * width_col;
+ }
+ }
+ }
+}
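+
+// Compared with the plain deformable im2col in deform_conv_cuda_kernel.cuh,
+// this modulated (DCNv2) variant additionally multiplies every bilinearly
+// sampled value by a learned per-location mask (val * mask); the col2im
+// kernels below apply the same mask when propagating gradients, and the coord
+// kernel also produces grad_mask.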
+
+template <typename T>
+__global__ void modulated_deformable_col2im_gpu_kernel(
+ const int n, const T *data_col, const T *data_offset, const T *data_mask,
+ const int channels, const int height, const int width, const int kernel_h,
+ const int kernel_w, const int pad_h, const int pad_w, const int stride_h,
+ const int stride_w, const int dilation_h, const int dilation_w,
+ const int channel_per_deformable_group, const int batch_size,
+ const int deformable_group, const int height_col, const int width_col,
+ T *grad_im) {
+ CUDA_1D_KERNEL_LOOP(index, n) {
+ const int j = (index / width_col / height_col / batch_size) % kernel_w;
+ const int i =
+ (index / width_col / height_col / batch_size / kernel_w) % kernel_h;
+ const int c =
+ index / width_col / height_col / batch_size / kernel_w / kernel_h;
+ // compute the start and end of the output
+
+ const int deformable_group_index = c / channel_per_deformable_group;
+
+ int w_out = index % width_col;
+ int h_out = (index / width_col) % height_col;
+ int b = (index / width_col / height_col) % batch_size;
+ int w_in = w_out * stride_w - pad_w;
+ int h_in = h_out * stride_h - pad_h;
+
+ const T *data_offset_ptr =
+ data_offset + (b * deformable_group + deformable_group_index) * 2 *
+ kernel_h * kernel_w * height_col * width_col;
+ const T *data_mask_ptr =
+ data_mask + (b * deformable_group + deformable_group_index) * kernel_h *
+ kernel_w * height_col * width_col;
+ const int data_offset_h_ptr =
+ ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out;
+ const int data_offset_w_ptr =
+ ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out;
+ const int data_mask_hw_ptr =
+ ((i * kernel_w + j) * height_col + h_out) * width_col + w_out;
+ const T offset_h = data_offset_ptr[data_offset_h_ptr];
+ const T offset_w = data_offset_ptr[data_offset_w_ptr];
+ const T mask = data_mask_ptr[data_mask_hw_ptr];
+ const T cur_inv_h_data = h_in + i * dilation_h + offset_h;
+ const T cur_inv_w_data = w_in + j * dilation_w + offset_w;
+
+ const T cur_top_grad = data_col[index] * mask;
+ const int cur_h = (int)cur_inv_h_data;
+ const int cur_w = (int)cur_inv_w_data;
+ for (int dy = -2; dy <= 2; dy++) {
+ for (int dx = -2; dx <= 2; dx++) {
+ if (cur_h + dy >= 0 && cur_h + dy < height && cur_w + dx >= 0 &&
+ cur_w + dx < width && abs(cur_inv_h_data - (cur_h + dy)) < 1 &&
+ abs(cur_inv_w_data - (cur_w + dx)) < 1) {
+ int cur_bottom_grad_pos =
+ ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx;
+ T weight =
+ dmcn_get_gradient_weight(cur_inv_h_data, cur_inv_w_data,
+ cur_h + dy, cur_w + dx, height, width);
+ atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad);
+ }
+ }
+ }
+ }
+}
+
+template <typename T>
+__global__ void modulated_deformable_col2im_coord_gpu_kernel(
+ const int n, const T *data_col, const T *data_im, const T *data_offset,
+ const T *data_mask, const int channels, const int height, const int width,
+ const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
+ const int stride_h, const int stride_w, const int dilation_h,
+ const int dilation_w, const int channel_per_deformable_group,
+ const int batch_size, const int offset_channels, const int deformable_group,
+ const int height_col, const int width_col, T *grad_offset, T *grad_mask) {
+ CUDA_1D_KERNEL_LOOP(index, n) {
+ T val = 0, mval = 0;
+ int w = index % width_col;
+ int h = (index / width_col) % height_col;
+ int c = (index / width_col / height_col) % offset_channels;
+ int b = (index / width_col / height_col) / offset_channels;
+ // compute the start and end of the output
+
+ const int deformable_group_index = c / (2 * kernel_h * kernel_w);
+ const int col_step = kernel_h * kernel_w;
+ int cnt = 0;
+ const T *data_col_ptr = data_col + deformable_group_index *
+ channel_per_deformable_group *
+ batch_size * width_col * height_col;
+ const T *data_im_ptr =
+ data_im + (b * deformable_group + deformable_group_index) *
+ channel_per_deformable_group / kernel_h / kernel_w *
+ height * width;
+ const T *data_offset_ptr =
+ data_offset + (b * deformable_group + deformable_group_index) * 2 *
+ kernel_h * kernel_w * height_col * width_col;
+ const T *data_mask_ptr =
+ data_mask + (b * deformable_group + deformable_group_index) * kernel_h *
+ kernel_w * height_col * width_col;
+
+ const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w;
+
+ for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group;
+ col_c += col_step) {
+ const int col_pos =
+ (((col_c * batch_size + b) * height_col) + h) * width_col + w;
+ const int bp_dir = offset_c % 2;
+
+ int j = (col_pos / width_col / height_col / batch_size) % kernel_w;
+ int i =
+ (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h;
+ int w_out = col_pos % width_col;
+ int h_out = (col_pos / width_col) % height_col;
+ int w_in = w_out * stride_w - pad_w;
+ int h_in = h_out * stride_h - pad_h;
+ const int data_offset_h_ptr =
+ (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out);
+ const int data_offset_w_ptr =
+ (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col +
+ w_out);
+ const int data_mask_hw_ptr =
+ (((i * kernel_w + j) * height_col + h_out) * width_col + w_out);
+ const T offset_h = data_offset_ptr[data_offset_h_ptr];
+ const T offset_w = data_offset_ptr[data_offset_w_ptr];
+ const T mask = data_mask_ptr[data_mask_hw_ptr];
+ T inv_h = h_in + i * dilation_h + offset_h;
+ T inv_w = w_in + j * dilation_w + offset_w;
+ if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width)
+ inv_h = inv_w = -2;
+ else
+ mval += data_col_ptr[col_pos] *
+ dmcn_im2col_bilinear(data_im_ptr + cnt * height * width, width,
+ height, width, inv_h, inv_w);
+ const T weight = dmcn_get_coordinate_weight(
+ inv_h, inv_w, height, width, data_im_ptr + cnt * height * width,
+ width, bp_dir);
+ val += weight * data_col_ptr[col_pos] * mask;
+ cnt += 1;
+ }
+ // KERNEL_ASSIGN(grad_offset[index], offset_req, val);
+ grad_offset[index] = val;
+ if (offset_c % 2 == 0)
+ // KERNEL_ASSIGN(grad_mask[(((b * deformable_group +
+ // deformable_group_index) * kernel_h * kernel_w + offset_c / 2) *
+ // height_col + h) * width_col + w], mask_req, mval);
+ grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h *
+ kernel_w +
+ offset_c / 2) *
+ height_col +
+ h) *
+ width_col +
+ w] = mval;
+ }
+}
+
+#endif // MODULATED_DEFORM_CONV_CUDA_KERNEL_CUH
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/csrc/common/cuda/ms_deform_attn_cuda_kernel.cuh b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/csrc/common/cuda/ms_deform_attn_cuda_kernel.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..4e59bd3dcd3c115e4152ebf771eda260b09236f3
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/csrc/common/cuda/ms_deform_attn_cuda_kernel.cuh
@@ -0,0 +1,797 @@
+/*!
+**************************************************************************************************
+* Deformable DETR
+* Copyright (c) 2020 SenseTime. All Rights Reserved.
+* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+**************************************************************************************************
+* Modified from
+*https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+**************************************************************************************************
+*/
+#ifndef DEFORM_ATTN_CUDA_KERNEL
+#define DEFORM_ATTN_CUDA_KERNEL
+
+#include "common_cuda_helper.hpp"
+#include "pytorch_cuda_helper.hpp"
+
+const int CUDA_NUM_THREADS = 1024;
+
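+// Bilinear sampling of the value tensor laid out as (spatial, num_heads,
+// channels): the four neighbouring entries around (h, w) are gathered for
+// head m / channel c, with out-of-range corners contributing zero.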
+template <typename scalar_t>
+__device__ scalar_t ms_deform_attn_im2col_bilinear(
+ const scalar_t *&bottom_data, const int &height, const int &width,
+ const int &nheads, const int &channels, const scalar_t &h,
+ const scalar_t &w, const int &m, const int &c) {
+ const int h_low = floorf(h);
+ const int w_low = floorf(w);
+ const int h_high = h_low + 1;
+ const int w_high = w_low + 1;
+
+ const scalar_t lh = h - h_low;
+ const scalar_t lw = w - w_low;
+ const scalar_t hh = 1 - lh, hw = 1 - lw;
+
+ const int w_stride = nheads * channels;
+ const int h_stride = width * w_stride;
+ const int h_low_ptr_offset = h_low * h_stride;
+ const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
+ const int w_low_ptr_offset = w_low * w_stride;
+ const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
+ const int base_ptr = m * channels + c;
+
+ scalar_t v1 = 0;
+ if (h_low >= 0 && w_low >= 0) {
+ const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
+ v1 = bottom_data[ptr1];
+ }
+ scalar_t v2 = 0;
+ if (h_low >= 0 && w_high <= width - 1) {
+ const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
+ v2 = bottom_data[ptr2];
+ }
+ scalar_t v3 = 0;
+ if (h_high <= height - 1 && w_low >= 0) {
+ const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
+ v3 = bottom_data[ptr3];
+ }
+ scalar_t v4 = 0;
+ if (h_high <= height - 1 && w_high <= width - 1) {
+ const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
+ v4 = bottom_data[ptr4];
+ }
+
+ const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
+
+ const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
+ return val;
+}
+
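+// Backward counterpart of the sampler above: scatters top_grad * attn_weight
+// to the four neighbouring value entries via atomicAdd and writes the
+// per-thread gradients of the sampling location and attention weight into the
+// caller-provided (typically shared-memory) slots.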
+template <typename scalar_t>
+__device__ void ms_deform_attn_col2im_bilinear(
+ const scalar_t *&bottom_data, const int &height, const int &width,
+ const int &nheads, const int &channels, const scalar_t &h,
+ const scalar_t &w, const int &m, const int &c, const scalar_t &top_grad,
+ const scalar_t &attn_weight, scalar_t *&grad_value,
+ scalar_t *grad_sampling_loc, scalar_t *grad_attn_weight) {
+ const int h_low = floorf(h);
+ const int w_low = floorf(w);
+ const int h_high = h_low + 1;
+ const int w_high = w_low + 1;
+
+ const scalar_t lh = h - h_low;
+ const scalar_t lw = w - w_low;
+ const scalar_t hh = 1 - lh, hw = 1 - lw;
+
+ const int w_stride = nheads * channels;
+ const int h_stride = width * w_stride;
+ const int h_low_ptr_offset = h_low * h_stride;
+ const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
+ const int w_low_ptr_offset = w_low * w_stride;
+ const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
+ const int base_ptr = m * channels + c;
+
+ const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
+ const scalar_t top_grad_value = top_grad * attn_weight;
+ scalar_t grad_h_weight = 0, grad_w_weight = 0;
+
+ scalar_t v1 = 0;
+ if (h_low >= 0 && w_low >= 0) {
+ const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
+ v1 = bottom_data[ptr1];
+ grad_h_weight -= hw * v1;
+ grad_w_weight -= hh * v1;
+ atomicAdd(grad_value + ptr1, w1 * top_grad_value);
+ }
+ scalar_t v2 = 0;
+ if (h_low >= 0 && w_high <= width - 1) {
+ const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
+ v2 = bottom_data[ptr2];
+ grad_h_weight -= lw * v2;
+ grad_w_weight += hh * v2;
+ atomicAdd(grad_value + ptr2, w2 * top_grad_value);
+ }
+ scalar_t v3 = 0;
+ if (h_high <= height - 1 && w_low >= 0) {
+ const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
+ v3 = bottom_data[ptr3];
+ grad_h_weight += hw * v3;
+ grad_w_weight -= lh * v3;
+ atomicAdd(grad_value + ptr3, w3 * top_grad_value);
+ }
+ scalar_t v4 = 0;
+ if (h_high <= height - 1 && w_high <= width - 1) {
+ const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
+ v4 = bottom_data[ptr4];
+ grad_h_weight += lw * v4;
+ grad_w_weight += lh * v4;
+ atomicAdd(grad_value + ptr4, w4 * top_grad_value);
+ }
+
+ const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
+ *grad_attn_weight = top_grad * val;
+ *grad_sampling_loc = width * grad_w_weight * top_grad_value;
+ *(grad_sampling_loc + 1) = height * grad_h_weight * top_grad_value;
+}
+
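+// Same as ms_deform_attn_col2im_bilinear, but the sampling-location and
+// attention-weight gradients are accumulated directly in global memory with
+// atomicAdd instead of being staged in shared memory first.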
+template <typename scalar_t>
+__device__ void ms_deform_attn_col2im_bilinear_gm(
+ const scalar_t *&bottom_data, const int &height, const int &width,
+ const int &nheads, const int &channels, const scalar_t &h,
+ const scalar_t &w, const int &m, const int &c, const scalar_t &top_grad,
+ const scalar_t &attn_weight, scalar_t *&grad_value,
+ scalar_t *grad_sampling_loc, scalar_t *grad_attn_weight) {
+ const int h_low = floorf(h);
+ const int w_low = floorf(w);
+ const int h_high = h_low + 1;
+ const int w_high = w_low + 1;
+
+ const scalar_t lh = h - h_low;
+ const scalar_t lw = w - w_low;
+ const scalar_t hh = 1 - lh, hw = 1 - lw;
+
+ const int w_stride = nheads * channels;
+ const int h_stride = width * w_stride;
+ const int h_low_ptr_offset = h_low * h_stride;
+ const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
+ const int w_low_ptr_offset = w_low * w_stride;
+ const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
+ const int base_ptr = m * channels + c;
+
+ const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
+ const scalar_t top_grad_value = top_grad * attn_weight;
+ scalar_t grad_h_weight = 0, grad_w_weight = 0;
+
+ scalar_t v1 = 0;
+ if (h_low >= 0 && w_low >= 0) {
+ const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
+ v1 = bottom_data[ptr1];
+ grad_h_weight -= hw * v1;
+ grad_w_weight -= hh * v1;
+ atomicAdd(grad_value + ptr1, w1 * top_grad_value);
+ }
+ scalar_t v2 = 0;
+ if (h_low >= 0 && w_high <= width - 1) {
+ const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
+ v2 = bottom_data[ptr2];
+ grad_h_weight -= lw * v2;
+ grad_w_weight += hh * v2;
+ atomicAdd(grad_value + ptr2, w2 * top_grad_value);
+ }
+ scalar_t v3 = 0;
+ if (h_high <= height - 1 && w_low >= 0) {
+ const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
+ v3 = bottom_data[ptr3];
+ grad_h_weight += hw * v3;
+ grad_w_weight -= lh * v3;
+ atomicAdd(grad_value + ptr3, w3 * top_grad_value);
+ }
+ scalar_t v4 = 0;
+ if (h_high <= height - 1 && w_high <= width - 1) {
+ const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
+ v4 = bottom_data[ptr4];
+ grad_h_weight += lw * v4;
+ grad_w_weight += lh * v4;
+ atomicAdd(grad_value + ptr4, w4 * top_grad_value);
+ }
+
+ const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
+ atomicAdd(grad_attn_weight, top_grad * val);
+ atomicAdd(grad_sampling_loc, width * grad_w_weight * top_grad_value);
+ atomicAdd(grad_sampling_loc + 1, height * grad_h_weight * top_grad_value);
+}
+
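+// Forward im2col kernel: one thread per output element (batch, query, head,
+// channel). For every feature level and sampling point, the value tensor is
+// bilinearly sampled and accumulated with its attention weight.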
+template <typename scalar_t>
+__global__ void ms_deformable_im2col_gpu_kernel(
+ const int n, const scalar_t *data_value, const int64_t *data_spatial_shapes,
+ const int64_t *data_level_start_index, const scalar_t *data_sampling_loc,
+ const scalar_t *data_attn_weight, const int batch_size,
+ const int spatial_size, const int num_heads, const int channels,
+ const int num_levels, const int num_query, const int num_point,
+ scalar_t *data_col) {
+ CUDA_1D_KERNEL_LOOP(index, n) {
+ int _temp = index;
+ const int c_col = _temp % channels;
+ _temp /= channels;
+ const int sampling_index = _temp;
+ const int m_col = _temp % num_heads;
+ _temp /= num_heads;
+ _temp /= num_query;
+ const int b_col = _temp;
+
+ scalar_t *data_col_ptr = data_col + index;
+ int data_weight_ptr = sampling_index * num_levels * num_point;
+ int data_loc_w_ptr = data_weight_ptr << 1;
+ const int qid_stride = num_heads * channels;
+ const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
+ scalar_t col = 0;
+
+ for (int l_col = 0; l_col < num_levels; ++l_col) {
+ const int level_start_id = data_level_start_index[l_col];
+ const int spatial_h_ptr = l_col << 1;
+ const int spatial_h = data_spatial_shapes[spatial_h_ptr];
+ const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
+ const scalar_t *data_value_ptr =
+ data_value +
+ (data_value_ptr_init_offset + level_start_id * qid_stride);
+ for (int p_col = 0; p_col < num_point; ++p_col) {
+ const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
+ const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
+ const scalar_t weight = data_attn_weight[data_weight_ptr];
+
+ const scalar_t h_im = loc_h * spatial_h - 0.5;
+ const scalar_t w_im = loc_w * spatial_w - 0.5;
+
+ if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) {
+ col += ms_deform_attn_im2col_bilinear(data_value_ptr, spatial_h,
+ spatial_w, num_heads, channels,
+ h_im, w_im, m_col, c_col) *
+ weight;
+ }
+
+ data_weight_ptr += 1;
+ data_loc_w_ptr += 2;
+ }
+ }
+ *data_col_ptr = col;
+ }
+}
+
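+// Backward kernel specialized on a compile-time block size: each thread
+// stages its partial gradients in statically sized shared memory, and thread
+// 0 reduces them serially before writing out the block result.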
+template <typename scalar_t, unsigned int blockSize>
+__global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1(
+ const int n, const scalar_t *grad_col, const scalar_t *data_value,
+ const int64_t *data_spatial_shapes, const int64_t *data_level_start_index,
+ const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight,
+ const int batch_size, const int spatial_size, const int num_heads,
+ const int channels, const int num_levels, const int num_query,
+ const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc,
+ scalar_t *grad_attn_weight) {
+ CUDA_1D_KERNEL_LOOP(index, n) {
+ __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2];
+ __shared__ scalar_t cache_grad_attn_weight[blockSize];
+ unsigned int tid = threadIdx.x;
+ int _temp = index;
+ const int c_col = _temp % channels;
+ _temp /= channels;
+ const int sampling_index = _temp;
+ const int m_col = _temp % num_heads;
+ _temp /= num_heads;
+ _temp /= num_query;
+ const int b_col = _temp;
+
+ const scalar_t top_grad = grad_col[index];
+
+ int data_weight_ptr = sampling_index * num_levels * num_point;
+ int data_loc_w_ptr = data_weight_ptr << 1;
+ const int grad_sampling_ptr = data_weight_ptr;
+ grad_sampling_loc += grad_sampling_ptr << 1;
+ grad_attn_weight += grad_sampling_ptr;
+ const int grad_weight_stride = 1;
+ const int grad_loc_stride = 2;
+ const int qid_stride = num_heads * channels;
+ const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
+
+ for (int l_col = 0; l_col < num_levels; ++l_col) {
+ const int level_start_id = data_level_start_index[l_col];
+ const int spatial_h_ptr = l_col << 1;
+ const int spatial_h = data_spatial_shapes[spatial_h_ptr];
+ const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
+ const int value_ptr_offset =
+ data_value_ptr_init_offset + level_start_id * qid_stride;
+ const scalar_t *data_value_ptr = data_value + value_ptr_offset;
+ scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
+
+ for (int p_col = 0; p_col < num_point; ++p_col) {
+ const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
+ const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
+ const scalar_t weight = data_attn_weight[data_weight_ptr];
+
+ const scalar_t h_im = loc_h * spatial_h - 0.5;
+ const scalar_t w_im = loc_w * spatial_w - 0.5;
+ *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0;
+ *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0;
+ *(cache_grad_attn_weight + threadIdx.x) = 0;
+ if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) {
+ ms_deform_attn_col2im_bilinear(
+ data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im,
+ w_im, m_col, c_col, top_grad, weight, grad_value_ptr,
+ cache_grad_sampling_loc + (threadIdx.x << 1),
+ cache_grad_attn_weight + threadIdx.x);
+ }
+
+ __syncthreads();
+ if (tid == 0) {
+ scalar_t _grad_w = cache_grad_sampling_loc[0],
+ _grad_h = cache_grad_sampling_loc[1],
+ _grad_a = cache_grad_attn_weight[0];
+ int sid = 2;
+ for (unsigned int tid = 1; tid < blockSize; ++tid) {
+ _grad_w += cache_grad_sampling_loc[sid];
+ _grad_h += cache_grad_sampling_loc[sid + 1];
+ _grad_a += cache_grad_attn_weight[tid];
+ sid += 2;
+ }
+
+ *grad_sampling_loc = _grad_w;
+ *(grad_sampling_loc + 1) = _grad_h;
+ *grad_attn_weight = _grad_a;
+ }
+ __syncthreads();
+
+ data_weight_ptr += 1;
+ data_loc_w_ptr += 2;
+ grad_attn_weight += grad_weight_stride;
+ grad_sampling_loc += grad_loc_stride;
+ }
+ }
+ }
+}
+
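+// Variant of the kernel above that replaces the serial reduction with a
+// shared-memory tree reduction (blockSize is assumed to be a power of two).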
+template <typename scalar_t, unsigned int blockSize>
+__global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2(
+ const int n, const scalar_t *grad_col, const scalar_t *data_value,
+ const int64_t *data_spatial_shapes, const int64_t *data_level_start_index,
+ const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight,
+ const int batch_size, const int spatial_size, const int num_heads,
+ const int channels, const int num_levels, const int num_query,
+ const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc,
+ scalar_t *grad_attn_weight) {
+ CUDA_1D_KERNEL_LOOP(index, n) {
+ __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2];
+ __shared__ scalar_t cache_grad_attn_weight[blockSize];
+ unsigned int tid = threadIdx.x;
+ int _temp = index;
+ const int c_col = _temp % channels;
+ _temp /= channels;
+ const int sampling_index = _temp;
+ const int m_col = _temp % num_heads;
+ _temp /= num_heads;
+ _temp /= num_query;
+ const int b_col = _temp;
+
+ const scalar_t top_grad = grad_col[index];
+
+ int data_weight_ptr = sampling_index * num_levels * num_point;
+ int data_loc_w_ptr = data_weight_ptr << 1;
+ const int grad_sampling_ptr = data_weight_ptr;
+ grad_sampling_loc += grad_sampling_ptr << 1;
+ grad_attn_weight += grad_sampling_ptr;
+ const int grad_weight_stride = 1;
+ const int grad_loc_stride = 2;
+ const int qid_stride = num_heads * channels;
+ const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
+
+ for (int l_col = 0; l_col < num_levels; ++l_col) {
+ const int level_start_id = data_level_start_index[l_col];
+ const int spatial_h_ptr = l_col << 1;
+ const int spatial_h = data_spatial_shapes[spatial_h_ptr];
+ const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
+ const int value_ptr_offset =
+ data_value_ptr_init_offset + level_start_id * qid_stride;
+ const scalar_t *data_value_ptr = data_value + value_ptr_offset;
+ scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
+
+ for (int p_col = 0; p_col < num_point; ++p_col) {
+ const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
+ const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
+ const scalar_t weight = data_attn_weight[data_weight_ptr];
+
+ const scalar_t h_im = loc_h * spatial_h - 0.5;
+ const scalar_t w_im = loc_w * spatial_w - 0.5;
+ *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0;
+ *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0;
+ *(cache_grad_attn_weight + threadIdx.x) = 0;
+ if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) {
+ ms_deform_attn_col2im_bilinear(
+ data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im,
+ w_im, m_col, c_col, top_grad, weight, grad_value_ptr,
+ cache_grad_sampling_loc + (threadIdx.x << 1),
+ cache_grad_attn_weight + threadIdx.x);
+ }
+
+ __syncthreads();
+
+ for (unsigned int s = blockSize / 2; s > 0; s >>= 1) {
+ if (tid < s) {
+ const unsigned int xid1 = tid << 1;
+ const unsigned int xid2 = (tid + s) << 1;
+ cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s];
+ cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2];
+ cache_grad_sampling_loc[xid1 + 1] +=
+ cache_grad_sampling_loc[xid2 + 1];
+ }
+ __syncthreads();
+ }
+
+ if (tid == 0) {
+ *grad_sampling_loc = cache_grad_sampling_loc[0];
+ *(grad_sampling_loc + 1) = cache_grad_sampling_loc[1];
+ *grad_attn_weight = cache_grad_attn_weight[0];
+ }
+ __syncthreads();
+
+ data_weight_ptr += 1;
+ data_loc_w_ptr += 2;
+ grad_attn_weight += grad_weight_stride;
+ grad_sampling_loc += grad_loc_stride;
+ }
+ }
+ }
+}
+
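+// Variant using dynamically allocated shared memory (extern __shared__), so
+// the block size can be chosen at launch time; thread 0 still reduces the
+// cached gradients serially.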
+template <typename scalar_t>
+__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v1(
+ const int n, const scalar_t *grad_col, const scalar_t *data_value,
+ const int64_t *data_spatial_shapes, const int64_t *data_level_start_index,
+ const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight,
+ const int batch_size, const int spatial_size, const int num_heads,
+ const int channels, const int num_levels, const int num_query,
+ const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc,
+ scalar_t *grad_attn_weight) {
+ CUDA_1D_KERNEL_LOOP(index, n) {
+ extern __shared__ int _s[];
+ scalar_t *cache_grad_sampling_loc = reinterpret_cast<scalar_t *>(_s);
+ scalar_t *cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;
+ unsigned int tid = threadIdx.x;
+ int _temp = index;
+ const int c_col = _temp % channels;
+ _temp /= channels;
+ const int sampling_index = _temp;
+ const int m_col = _temp % num_heads;
+ _temp /= num_heads;
+ _temp /= num_query;
+ const int b_col = _temp;
+
+ const scalar_t top_grad = grad_col[index];
+
+ int data_weight_ptr = sampling_index * num_levels * num_point;
+ int data_loc_w_ptr = data_weight_ptr << 1;
+ const int grad_sampling_ptr = data_weight_ptr;
+ grad_sampling_loc += grad_sampling_ptr << 1;
+ grad_attn_weight += grad_sampling_ptr;
+ const int grad_weight_stride = 1;
+ const int grad_loc_stride = 2;
+ const int qid_stride = num_heads * channels;
+ const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
+
+ for (int l_col = 0; l_col < num_levels; ++l_col) {
+ const int level_start_id = data_level_start_index[l_col];
+ const int spatial_h_ptr = l_col << 1;
+ const int spatial_h = data_spatial_shapes[spatial_h_ptr];
+ const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
+ const int value_ptr_offset =
+ data_value_ptr_init_offset + level_start_id * qid_stride;
+ const scalar_t *data_value_ptr = data_value + value_ptr_offset;
+ scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
+
+ for (int p_col = 0; p_col < num_point; ++p_col) {
+ const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
+ const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
+ const scalar_t weight = data_attn_weight[data_weight_ptr];
+
+ const scalar_t h_im = loc_h * spatial_h - 0.5;
+ const scalar_t w_im = loc_w * spatial_w - 0.5;
+ *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0;
+ *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0;
+ *(cache_grad_attn_weight + threadIdx.x) = 0;
+ if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) {
+ ms_deform_attn_col2im_bilinear(
+ data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im,
+ w_im, m_col, c_col, top_grad, weight, grad_value_ptr,
+ cache_grad_sampling_loc + (threadIdx.x << 1),
+ cache_grad_attn_weight + threadIdx.x);
+ }
+
+ __syncthreads();
+ if (tid == 0) {
+ scalar_t _grad_w = cache_grad_sampling_loc[0],
+ _grad_h = cache_grad_sampling_loc[1],
+ _grad_a = cache_grad_attn_weight[0];
+ int sid = 2;
+ for (unsigned int tid = 1; tid < blockDim.x; ++tid) {
+ _grad_w += cache_grad_sampling_loc[sid];
+ _grad_h += cache_grad_sampling_loc[sid + 1];
+ _grad_a += cache_grad_attn_weight[tid];
+ sid += 2;
+ }
+
+ *grad_sampling_loc = _grad_w;
+ *(grad_sampling_loc + 1) = _grad_h;
+ *grad_attn_weight = _grad_a;
+ }
+ __syncthreads();
+
+ data_weight_ptr += 1;
+ data_loc_w_ptr += 2;
+ grad_attn_weight += grad_weight_stride;
+ grad_sampling_loc += grad_loc_stride;
+ }
+ }
+ }
+}
+
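+// Dynamic shared-memory variant with a tree reduction; the extra `spre` term
+// folds in the leftover element when the active width is not a power of two.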
+template <typename scalar_t>
+__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2(
+ const int n, const scalar_t *grad_col, const scalar_t *data_value,
+ const int64_t *data_spatial_shapes, const int64_t *data_level_start_index,
+ const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight,
+ const int batch_size, const int spatial_size, const int num_heads,
+ const int channels, const int num_levels, const int num_query,
+ const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc,
+ scalar_t *grad_attn_weight) {
+ CUDA_1D_KERNEL_LOOP(index, n) {
+ extern __shared__ int _s[];
+ scalar_t *cache_grad_sampling_loc = reinterpret_cast<scalar_t *>(_s);
+ scalar_t *cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;
+ unsigned int tid = threadIdx.x;
+ int _temp = index;
+ const int c_col = _temp % channels;
+ _temp /= channels;
+ const int sampling_index = _temp;
+ const int m_col = _temp % num_heads;
+ _temp /= num_heads;
+ _temp /= num_query;
+ const int b_col = _temp;
+
+ const scalar_t top_grad = grad_col[index];
+
+ int data_weight_ptr = sampling_index * num_levels * num_point;
+ int data_loc_w_ptr = data_weight_ptr << 1;
+ const int grad_sampling_ptr = data_weight_ptr;
+ grad_sampling_loc += grad_sampling_ptr << 1;
+ grad_attn_weight += grad_sampling_ptr;
+ const int grad_weight_stride = 1;
+ const int grad_loc_stride = 2;
+ const int qid_stride = num_heads * channels;
+ const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
+
+ for (int l_col = 0; l_col < num_levels; ++l_col) {
+ const int level_start_id = data_level_start_index[l_col];
+ const int spatial_h_ptr = l_col << 1;
+ const int spatial_h = data_spatial_shapes[spatial_h_ptr];
+ const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
+ const int value_ptr_offset =
+ data_value_ptr_init_offset + level_start_id * qid_stride;
+ const scalar_t *data_value_ptr = data_value + value_ptr_offset;
+ scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
+
+ for (int p_col = 0; p_col < num_point; ++p_col) {
+ const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
+ const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
+ const scalar_t weight = data_attn_weight[data_weight_ptr];
+
+ const scalar_t h_im = loc_h * spatial_h - 0.5;
+ const scalar_t w_im = loc_w * spatial_w - 0.5;
+ *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0;
+ *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0;
+ *(cache_grad_attn_weight + threadIdx.x) = 0;
+ if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) {
+ ms_deform_attn_col2im_bilinear(
+ data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im,
+ w_im, m_col, c_col, top_grad, weight, grad_value_ptr,
+ cache_grad_sampling_loc + (threadIdx.x << 1),
+ cache_grad_attn_weight + threadIdx.x);
+ }
+
+ __syncthreads();
+
+ for (unsigned int s = blockDim.x / 2, spre = blockDim.x; s > 0;
+ s >>= 1, spre >>= 1) {
+ if (tid < s) {
+ const unsigned int xid1 = tid << 1;
+ const unsigned int xid2 = (tid + s) << 1;
+ cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s];
+ cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2];
+ cache_grad_sampling_loc[xid1 + 1] +=
+ cache_grad_sampling_loc[xid2 + 1];
+ if (tid + (s << 1) < spre) {
+ cache_grad_attn_weight[tid] +=
+ cache_grad_attn_weight[tid + (s << 1)];
+ cache_grad_sampling_loc[xid1] +=
+ cache_grad_sampling_loc[xid2 + (s << 1)];
+ cache_grad_sampling_loc[xid1 + 1] +=
+ cache_grad_sampling_loc[xid2 + 1 + (s << 1)];
+ }
+ }
+ __syncthreads();
+ }
+
+ if (tid == 0) {
+ *grad_sampling_loc = cache_grad_sampling_loc[0];
+ *(grad_sampling_loc + 1) = cache_grad_sampling_loc[1];
+ *grad_attn_weight = cache_grad_attn_weight[0];
+ }
+ __syncthreads();
+
+ data_weight_ptr += 1;
+ data_loc_w_ptr += 2;
+ grad_attn_weight += grad_weight_stride;
+ grad_sampling_loc += grad_loc_stride;
+ }
+ }
+ }
+}
+
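+// Same tree reduction as above, but the per-block results are added to global
+// memory with atomicAdd so that several blocks can safely contribute to the
+// same gradient slot.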
+template <typename scalar_t>
+__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks(
+ const int n, const scalar_t *grad_col, const scalar_t *data_value,
+ const int64_t *data_spatial_shapes, const int64_t *data_level_start_index,
+ const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight,
+ const int batch_size, const int spatial_size, const int num_heads,
+ const int channels, const int num_levels, const int num_query,
+ const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc,
+ scalar_t *grad_attn_weight) {
+ CUDA_1D_KERNEL_LOOP(index, n) {
+ extern __shared__ int _s[];
+ scalar_t *cache_grad_sampling_loc = reinterpret_cast<scalar_t *>(_s);
+ scalar_t *cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;
+ unsigned int tid = threadIdx.x;
+ int _temp = index;
+ const int c_col = _temp % channels;
+ _temp /= channels;
+ const int sampling_index = _temp;
+ const int m_col = _temp % num_heads;
+ _temp /= num_heads;
+ _temp /= num_query;
+ const int b_col = _temp;
+
+ const scalar_t top_grad = grad_col[index];
+
+ int data_weight_ptr = sampling_index * num_levels * num_point;
+ int data_loc_w_ptr = data_weight_ptr << 1;
+ const int grad_sampling_ptr = data_weight_ptr;
+ grad_sampling_loc += grad_sampling_ptr << 1;
+ grad_attn_weight += grad_sampling_ptr;
+ const int grad_weight_stride = 1;
+ const int grad_loc_stride = 2;
+ const int qid_stride = num_heads * channels;
+ const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
+
+ for (int l_col = 0; l_col < num_levels; ++l_col) {
+ const int level_start_id = data_level_start_index[l_col];
+ const int spatial_h_ptr = l_col << 1;
+ const int spatial_h = data_spatial_shapes[spatial_h_ptr];
+ const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
+ const int value_ptr_offset =
+ data_value_ptr_init_offset + level_start_id * qid_stride;
+ const scalar_t *data_value_ptr = data_value + value_ptr_offset;
+ scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
+
+ for (int p_col = 0; p_col < num_point; ++p_col) {
+ const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
+ const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
+ const scalar_t weight = data_attn_weight[data_weight_ptr];
+
+ const scalar_t h_im = loc_h * spatial_h - 0.5;
+ const scalar_t w_im = loc_w * spatial_w - 0.5;
+ *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0;
+ *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0;
+ *(cache_grad_attn_weight + threadIdx.x) = 0;
+ if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) {
+ ms_deform_attn_col2im_bilinear(
+ data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im,
+ w_im, m_col, c_col, top_grad, weight, grad_value_ptr,
+ cache_grad_sampling_loc + (threadIdx.x << 1),
+ cache_grad_attn_weight + threadIdx.x);
+ }
+
+ __syncthreads();
+
+ for (unsigned int s = blockDim.x / 2, spre = blockDim.x; s > 0;
+ s >>= 1, spre >>= 1) {
+ if (tid < s) {
+ const unsigned int xid1 = tid << 1;
+ const unsigned int xid2 = (tid + s) << 1;
+ cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s];
+ cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2];
+ cache_grad_sampling_loc[xid1 + 1] +=
+ cache_grad_sampling_loc[xid2 + 1];
+ if (tid + (s << 1) < spre) {
+ cache_grad_attn_weight[tid] +=
+ cache_grad_attn_weight[tid + (s << 1)];
+ cache_grad_sampling_loc[xid1] +=
+ cache_grad_sampling_loc[xid2 + (s << 1)];
+ cache_grad_sampling_loc[xid1 + 1] +=
+ cache_grad_sampling_loc[xid2 + 1 + (s << 1)];
+ }
+ }
+ __syncthreads();
+ }
+
+ if (tid == 0) {
+ atomicAdd(grad_sampling_loc, cache_grad_sampling_loc[0]);
+ atomicAdd(grad_sampling_loc + 1, cache_grad_sampling_loc[1]);
+ atomicAdd(grad_attn_weight, cache_grad_attn_weight[0]);
+ }
+ __syncthreads();
+
+ data_weight_ptr += 1;
+ data_loc_w_ptr += 2;
+ grad_attn_weight += grad_weight_stride;
+ grad_sampling_loc += grad_loc_stride;
+ }
+ }
+ }
+}
+
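+// Fallback backward kernel that skips shared memory entirely: every thread
+// accumulates its gradients straight into global memory via
+// ms_deform_attn_col2im_bilinear_gm.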
+template <typename scalar_t>
+__global__ void ms_deformable_col2im_gpu_kernel_gm(
+ const int n, const scalar_t *grad_col, const scalar_t *data_value,
+ const int64_t *data_spatial_shapes, const int64_t *data_level_start_index,
+ const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight,
+ const int batch_size, const int spatial_size, const int num_heads,
+ const int channels, const int num_levels, const int num_query,
+ const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc,
+ scalar_t *grad_attn_weight) {
+ CUDA_1D_KERNEL_LOOP(index, n) {
+ int _temp = index;
+ const int c_col = _temp % channels;
+ _temp /= channels;
+ const int sampling_index = _temp;
+ const int m_col = _temp % num_heads;
+ _temp /= num_heads;
+ _temp /= num_query;
+ const int b_col = _temp;
+
+ const scalar_t top_grad = grad_col[index];
+
+ int data_weight_ptr = sampling_index * num_levels * num_point;
+ int data_loc_w_ptr = data_weight_ptr << 1;
+ const int grad_sampling_ptr = data_weight_ptr;
+ grad_sampling_loc += grad_sampling_ptr << 1;
+ grad_attn_weight += grad_sampling_ptr;
+ const int grad_weight_stride = 1;
+ const int grad_loc_stride = 2;
+ const int qid_stride = num_heads * channels;
+ const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
+
+ for (int l_col = 0; l_col < num_levels; ++l_col) {
+ const int level_start_id = data_level_start_index[l_col];
+ const int spatial_h_ptr = l_col << 1;
+ const int spatial_h = data_spatial_shapes[spatial_h_ptr];
+ const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
+ const int value_ptr_offset =
+ data_value_ptr_init_offset + level_start_id * qid_stride;
+ const scalar_t *data_value_ptr = data_value + value_ptr_offset;
+ scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
+
+ for (int p_col = 0; p_col < num_point; ++p_col) {
+ const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
+ const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
+ const scalar_t weight = data_attn_weight[data_weight_ptr];
+
+ const scalar_t h_im = loc_h * spatial_h - 0.5;
+ const scalar_t w_im = loc_w * spatial_w - 0.5;
+ if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) {
+ ms_deform_attn_col2im_bilinear_gm(
+ data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im,
+ w_im, m_col, c_col, top_grad, weight, grad_value_ptr,
+ grad_sampling_loc, grad_attn_weight);
+ }
+ data_weight_ptr += 1;
+ data_loc_w_ptr += 2;
+ grad_attn_weight += grad_weight_stride;
+ grad_sampling_loc += grad_loc_stride;
+ }
+ }
+ }
+}
+#endif // DEFORM_ATTN_CUDA_KERNEL
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/csrc/common/cuda/nms_cuda_kernel.cuh b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/csrc/common/cuda/nms_cuda_kernel.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..caf777e8c1706f9422cb64d8288eab53b9b15a8a
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/csrc/common/cuda/nms_cuda_kernel.cuh
@@ -0,0 +1,88 @@
+// encoding=utf-8
+// Copyright 2021 Huawei Technologies Co., Ltd
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#ifndef NMS_CUDA_KERNEL_CUH
+#define NMS_CUDA_KERNEL_CUH
+
+#include <vector>
+#ifdef MMCV_WITH_TRT
+#include "common_cuda_helper.hpp"
+#else // MMCV_WITH_TRT
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else // MMCV_USE_PARROTS
+#include "pytorch_cuda_helper.hpp"
+#endif // MMCV_USE_PARROTS
+#endif // MMCV_WITH_TRT
+
+int const threadsPerBlock = sizeof(unsigned long long int) * 8;
+
+__device__ inline bool devIoU(float const *const a, float const *const b,
+ const int offset, const float threshold) {
+ float left = fmaxf(a[0], b[0]), right = fminf(a[2], b[2]);
+ float top = fmaxf(a[1], b[1]), bottom = fminf(a[3], b[3]);
+ float width = fmaxf(right - left + offset, 0.f),
+ height = fmaxf(bottom - top + offset, 0.f);
+ float interS = width * height;
+ float Sa = (a[2] - a[0] + offset) * (a[3] - a[1] + offset);
+ float Sb = (b[2] - b[0] + offset) * (b[3] - b[1] + offset);
+ return interS > threshold * (Sa + Sb - interS);
+}
+
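+// Each (row_start, col_start) block compares one tile of up to
+// threadsPerBlock boxes against another tile and records which boxes it
+// suppresses as a 64-bit bitmask in dev_mask (one word per box and column
+// tile); the host-side wrapper is expected to combine these masks into the
+// final keep list.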
+__global__ void nms_cuda(const int n_boxes, const float iou_threshold,
+ const int offset, const float *dev_boxes,
+ unsigned long long *dev_mask) {
+ int blocks = (n_boxes + threadsPerBlock - 1) / threadsPerBlock;
+ CUDA_2D_KERNEL_BLOCK_LOOP(col_start, blocks, row_start, blocks) {
+ const int tid = threadIdx.x;
+
+ if (row_start > col_start) return;
+
+ const int row_size =
+ fminf(n_boxes - row_start * threadsPerBlock, threadsPerBlock);
+ const int col_size =
+ fminf(n_boxes - col_start * threadsPerBlock, threadsPerBlock);
+
+ __shared__ float block_boxes[threadsPerBlock * 4];
+ if (tid < col_size) {
+ block_boxes[tid * 4 + 0] =
+ dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 0];
+ block_boxes[tid * 4 + 1] =
+ dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 1];
+ block_boxes[tid * 4 + 2] =
+ dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 2];
+ block_boxes[tid * 4 + 3] =
+ dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 3];
+ }
+ __syncthreads();
+
+ if (tid < row_size) {
+ const int cur_box_idx = threadsPerBlock * row_start + tid;
+ const float *cur_box = dev_boxes + cur_box_idx * 4;
+ int i = 0;
+ unsigned long long int t = 0;
+ int start = 0;
+ if (row_start == col_start) {
+ start = tid + 1;
+ }
+ for (i = start; i < col_size; i++) {
+ if (devIoU(cur_box, block_boxes + i * 4, offset, iou_threshold)) {
+ t |= 1ULL << i;
+ }
+ }
+ dev_mask[cur_box_idx * gridDim.y + col_start] = t;
+ }
+ }
+}
+#endif // NMS_CUDA_KERNEL_CUH
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/csrc/common/cuda/nms_rotated_cuda.cuh b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/csrc/common/cuda/nms_rotated_cuda.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..80bed9681f748390999a2963bd3448570b0dbf6a
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/csrc/common/cuda/nms_rotated_cuda.cuh
@@ -0,0 +1,135 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+// modified from
+// https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/nms_rotated/nms_rotated_cuda.cu
+#ifndef NMS_ROTATED_CUDA_CUH
+#define NMS_ROTATED_CUDA_CUH
+
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else
+#include "pytorch_cuda_helper.hpp"
+#endif
+#include "box_iou_rotated_utils.hpp"
+
+__host__ __device__ inline int divideUP(const int x, const int y) {
+ return (((x) + (y)-1) / (y));
+}
+
+namespace {
+int const threadsPerBlock = sizeof(unsigned long long) * 8;
+}
+
+template <typename T>
+__global__ void nms_rotated_cuda_kernel(const int n_boxes,
+ const float iou_threshold,
+ const T* dev_boxes,
+ unsigned long long* dev_mask,
+ const int multi_label) {
+ // nms_rotated_cuda_kernel is modified from torchvision's nms_cuda_kernel
+
+ if (multi_label == 1) {
+ const int row_start = blockIdx.y;
+ const int col_start = blockIdx.x;
+
+ // if (row_start > col_start) return;
+
+ const int row_size =
+ min(n_boxes - row_start * threadsPerBlock, threadsPerBlock);
+ const int col_size =
+ min(n_boxes - col_start * threadsPerBlock, threadsPerBlock);
+
+ // Compared to nms_cuda_kernel, where each box is represented with 4 values
+ // (x1, y1, x2, y2), each rotated box is represented with 6 values here:
+ // (x_center, y_center, width, height, angle_degrees) plus a label.
+ __shared__ T block_boxes[threadsPerBlock * 6];
+ if (threadIdx.x < col_size) {
+ block_boxes[threadIdx.x * 6 + 0] =
+ dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 0];
+ block_boxes[threadIdx.x * 6 + 1] =
+ dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 1];
+ block_boxes[threadIdx.x * 6 + 2] =
+ dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 2];
+ block_boxes[threadIdx.x * 6 + 3] =
+ dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 3];
+ block_boxes[threadIdx.x * 6 + 4] =
+ dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 4];
+ block_boxes[threadIdx.x * 6 + 5] =
+ dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 5];
+ }
+ __syncthreads();
+
+ if (threadIdx.x < row_size) {
+ const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x;
+ const T* cur_box = dev_boxes + cur_box_idx * 6;
+ int i = 0;
+ unsigned long long t = 0;
+ int start = 0;
+ if (row_start == col_start) {
+ start = threadIdx.x + 1;
+ }
+ for (i = start; i < col_size; i++) {
+ // Instead of devIoU used by original horizontal nms, here
+ // we use the single_box_iou_rotated function from
+ // box_iou_rotated_utils.h
+ if (single_box_iou_rotated(cur_box, block_boxes + i * 6, 0) >
+ iou_threshold) {
+ t |= 1ULL << i;
+ }
+ }
+ const int col_blocks = divideUP(n_boxes, threadsPerBlock);
+ dev_mask[cur_box_idx * col_blocks + col_start] = t;
+ }
+ } else {
+ const int row_start = blockIdx.y;
+ const int col_start = blockIdx.x;
+
+ // if (row_start > col_start) return;
+
+ const int row_size =
+ min(n_boxes - row_start * threadsPerBlock, threadsPerBlock);
+ const int col_size =
+ min(n_boxes - col_start * threadsPerBlock, threadsPerBlock);
+
+ // Compared to nms_cuda_kernel, where each box is represented with 4 values
+ // (x1, y1, x2, y2), each rotated box is represented with 5 values
+ // (x_center, y_center, width, height, angle_degrees) here.
+ __shared__ T block_boxes[threadsPerBlock * 5];
+ if (threadIdx.x < col_size) {
+ block_boxes[threadIdx.x * 5 + 0] =
+ dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0];
+ block_boxes[threadIdx.x * 5 + 1] =
+ dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1];
+ block_boxes[threadIdx.x * 5 + 2] =
+ dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2];
+ block_boxes[threadIdx.x * 5 + 3] =
+ dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3];
+ block_boxes[threadIdx.x * 5 + 4] =
+ dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4];
+ }
+ __syncthreads();
+
+ if (threadIdx.x < row_size) {
+ const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x;
+ const T* cur_box = dev_boxes + cur_box_idx * 5;
+ int i = 0;
+ unsigned long long t = 0;
+ int start = 0;
+ if (row_start == col_start) {
+ start = threadIdx.x + 1;
+ }
+ for (i = start; i < col_size; i++) {
+ // Instead of devIoU used by original horizontal nms, here
+ // we use the single_box_iou_rotated function from
+ // box_iou_rotated_utils.h
+ if (single_box_iou_rotated(cur_box, block_boxes + i * 5, 0) >
+ iou_threshold) {
+ t |= 1ULL << i;
+ }
+ }
+ const int col_blocks = divideUP(n_boxes, threadsPerBlock);
+ dev_mask[cur_box_idx * col_blocks + col_start] = t;
+ }
+ }
+}
+
+#endif
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/csrc/common/cuda/parrots_cudawarpfunction.cuh b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/csrc/common/cuda/parrots_cudawarpfunction.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..7918a57452bbde9dc7c249b0c3dd2774aa1961bf
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/csrc/common/cuda/parrots_cudawarpfunction.cuh
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2019, SenseTime.
+ */
+
+#ifndef INCLUDE_PARROTS_DARRAY_CUDAWARPFUNCTION_CUH_
+#define INCLUDE_PARROTS_DARRAY_CUDAWARPFUNCTION_CUH_
+
+#ifndef __CUDACC__
+#error cudawarpfunction.cuh should only be included by .cu files
+#endif
+#include <cuda.h>
+
+#include
+
+#ifdef PARROTS_USE_HALF
+#include
+#endif
+#ifdef __CUDA_ARCH__
+#define CUDA_INTRINSIC_FUNC(Expr) Expr
+#else
+#define CUDA_INTRINSIC_FUNC(Expr)
+#endif
+
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300
+
+#ifdef PARROTS_USE_HALF
+
+#if CUDA_VERSION < 9000
+
+__device__ inline float16 __shfl(float16 var, int srcLane, int width) {
+ CUDA_INTRINSIC_FUNC(return __shfl(var.y, srcLane, width););
+}
+
+__device__ inline float16 __shfl_up(float16 var, unsigned delta, int width) {
+ CUDA_INTRINSIC_FUNC(return __shfl_up(var.y, delta, width););
+}
+
+__device__ inline float16 __shfl_down(float16 var, unsigned delta, int width) {
+ CUDA_INTRINSIC_FUNC(return __shfl_down(var.y, delta, width););
+}
+
+__device__ inline float16 __shfl_xor(float16 var, int laneMask, int width) {
+ CUDA_INTRINSIC_FUNC(return __shfl_xor(var.y, laneMask, width););
+}
+
+#else // CUDA_VERSION >= 9000
+
+__device__ inline float16 __shfl_sync(unsigned mask, float16 var, int srcLane,
+ int width = warpSize) {
+ CUDA_INTRINSIC_FUNC(float16 r; r.y = __shfl_sync(mask, var.y, srcLane, width);
+ return r;);
+}
+
+__device__ inline float16 __shfl_up_sync(unsigned mask, float16 var,
+ unsigned delta, int width = warpSize) {
+ CUDA_INTRINSIC_FUNC(
+ float16 r; r.y = __shfl_up_sync(mask, var.y, delta, width); return r;);
+}
+
+__device__ inline float16 __shfl_down_sync(unsigned mask, float16 var,
+ unsigned delta,
+ int width = warpSize) {
+ CUDA_INTRINSIC_FUNC(
+ float16 r; r.y = __shfl_down_sync(mask, var.y, delta, width); return r;);
+}
+
+__device__ inline float16 __shfl_xor_sync(unsigned mask, float16 var,
+ int laneMask, int width) {
+ CUDA_INTRINSIC_FUNC(float16 r;
+ r.y = __shfl_xor_sync(mask, var.y, laneMask, width);
+ return r;);
+}
+
+#endif // CUDA_VERSION < 9000
+
+#endif // PARROTS_USE_HALF
+
+// warp shuffle interface with a dummy mask
+#if CUDA_VERSION < 9000
+
+template <typename T>
+__device__ inline T __shfl_sync(unsigned mask, T var, int srcLane,
+ int width = warpSize) {
+ CUDA_INTRINSIC_FUNC(return __shfl(var, srcLane, width););
+}
+
+template <typename T>
+__device__ inline T __shfl_up_sync(unsigned mask, T var, unsigned delta,
+ int width = warpSize) {
+ CUDA_INTRINSIC_FUNC(return __shfl_up(var, delta, width););
+}
+
+template <typename T>
+__device__ inline T __shfl_down_sync(unsigned mask, T var, unsigned delta,
+ int width = warpSize) {
+ CUDA_INTRINSIC_FUNC(return __shfl_down(var, delta, width););
+}
+
+template <typename T>
+__device__ inline T __shfl_xor_sync(unsigned mask, T var, int laneMask,
+ int width = warpSize) {
+ CUDA_INTRINSIC_FUNC(return __shfl_xor(var, laneMask, width););
+}
+
+#endif // CUDA_VERSION < 9000
+
+#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300
+
+#endif // INCLUDE_PARROTS_DARRAY_CUDAWARPFUNCTION_CUH_
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/csrc/common/cuda/points_in_boxes_cuda_kernel.cuh b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/csrc/common/cuda/points_in_boxes_cuda_kernel.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..887293855df6cb611c62a6a4c02f92e787316944
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/csrc/common/cuda/points_in_boxes_cuda_kernel.cuh
@@ -0,0 +1,108 @@
+// encoding=utf-8
+// Copyright 2021 Huawei Technologies Co., Ltd
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#ifndef POINT_IN_BOXES_CUDA_KERNEL_CUH
+#define POINT_IN_BOXES_CUDA_KERNEL_CUH
+
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else
+#include "pytorch_cuda_helper.hpp"
+#endif
+
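+// Rotate an (x, y) offset by -rz so the box becomes axis-aligned in its local
+// frame; used below to test whether a point falls inside a rotated box.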
+template <typename T>
+__device__ inline void lidar_to_local_coords(T shift_x, T shift_y, T rz,
+ T &local_x, T &local_y) {
+ T cosa = cos(-rz), sina = sin(-rz);
+ local_x = shift_x * cosa + shift_y * (-sina);
+ local_y = shift_x * sina + shift_y * cosa;
+}
+
+template <typename T>
+__device__ inline int check_pt_in_box3d(const T *pt, const T *box3d, T &local_x,
+ T &local_y) {
+ // param pt: (x, y, z)
+ // param box3d: (cx, cy, cz, x_size, y_size, z_size, rz) in LiDAR coordinate,
+ // cz in the bottom center
+ T x = pt[0], y = pt[1], z = pt[2];
+ T cx = box3d[0], cy = box3d[1], cz = box3d[2];
+ T x_size = box3d[3], y_size = box3d[4], z_size = box3d[5], rz = box3d[6];
+ cz += z_size /
+ 2.0; // shift to the center since cz in box3d is the bottom center
+
+ if (fabsf(z - cz) > z_size / 2.0) return 0;
+ lidar_to_local_coords(x - cx, y - cy, rz, local_x, local_y);
+ float in_flag = (local_x > -x_size / 2.0) & (local_x < x_size / 2.0) &
+ (local_y > -y_size / 2.0) & (local_y < y_size / 2.0);
+ return in_flag;
+}
+
+template <typename T>
+__global__ void points_in_boxes_part_forward_cuda_kernel(
+ int batch_size, int boxes_num, int pts_num, const T *boxes, const T *pts,
+ int *box_idx_of_points) {
+ // params boxes: (B, N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR
+ // coordinate, z is the bottom center; the boxes are assumed not to overlap.
+ // params pts: (B, npoints, 3) [x, y, z] in LiDAR coordinate.
+ // params box_idx_of_points: (B, npoints), default -1
+
+ int bs_idx = blockIdx.y;
+ CUDA_1D_KERNEL_LOOP(pt_idx, pts_num) {
+ if (bs_idx >= batch_size) return;
+
+ boxes += bs_idx * boxes_num * 7;
+ pts += bs_idx * pts_num * 3 + pt_idx * 3;
+ box_idx_of_points += bs_idx * pts_num + pt_idx;
+
+ T local_x = 0, local_y = 0;
+ int cur_in_flag = 0;
+ for (int k = 0; k < boxes_num; k++) {
+ cur_in_flag = check_pt_in_box3d(pts, boxes + k * 7, local_x, local_y);
+ if (cur_in_flag) {
+ box_idx_of_points[0] = k;
+ break;
+ }
+ }
+ }
+}
+
+template <typename T>
+__global__ void points_in_boxes_all_forward_cuda_kernel(
+ int batch_size, int boxes_num, int pts_num, const T *boxes, const T *pts,
+ int *box_idx_of_points) {
+ // params boxes: (B, N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR
+ // coordinate, z is the bottom center; the boxes are assumed not to overlap.
+ // params pts: (B, npoints, 3) [x, y, z] in LiDAR coordinate.
+ // params box_idx_of_points: (B, npoints), default -1
+
+ int bs_idx = blockIdx.y;
+ CUDA_1D_KERNEL_LOOP(pt_idx, pts_num) {
+ if (bs_idx >= batch_size) return;
+
+ boxes += bs_idx * boxes_num * 7;
+ pts += bs_idx * pts_num * 3 + pt_idx * 3;
+ box_idx_of_points += bs_idx * pts_num * boxes_num + pt_idx * boxes_num;
+
+ T local_x = 0, local_y = 0;
+ for (int k = 0; k < boxes_num; k++) {
+ const int cur_in_flag =
+ check_pt_in_box3d(pts, boxes + k * 7, local_x, local_y);
+ if (cur_in_flag) {
+ box_idx_of_points[k] = 1;
+ }
+ }
+ }
+}
+
+#endif // POINT_IN_BOXES_CUDA_KERNEL_CUH
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/csrc/common/cuda/points_in_polygons_cuda_kernel.cuh b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/csrc/common/cuda/points_in_polygons_cuda_kernel.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..5ecc1a5be38b960d8e60aba7eb77efd73ad41ffb
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/csrc/common/cuda/points_in_polygons_cuda_kernel.cuh
@@ -0,0 +1,92 @@
+// encoding=utf-8
+// Copyright 2021 Huawei Technologies Co., Ltd
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#ifndef POINTS_IN_POLYGONS_CUDA_KERNEL_CUH
+#define POINTS_IN_POLYGONS_CUDA_KERNEL_CUH
+
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else
+#include "pytorch_cuda_helper.hpp"
+#endif
+
+struct point {
+ float x, y;
+};
+
+template <typename scalar_t>
+__global__ void points_in_polygons_forward_cuda_kernel(
+ const int nthreads, const scalar_t *vertex1, const scalar_t *vertex2,
+ const int rows, const int cols, scalar_t *inside_flag) {
+ CUDA_1D_KERNEL_LOOP(index, nthreads) {
+ int row = index / cols;
+ int col = index % cols;
+
+ const scalar_t *offset_vertex1 = vertex1 + row * 2;
+ const scalar_t *offset_vertex2 = vertex2 + col * 8;
+
+ point point_[1];
+ point polygon[4];
+
+ point_[0].x = offset_vertex1[0];
+ point_[0].y = offset_vertex1[1];
+
+ polygon[0].x = offset_vertex2[0];
+ polygon[0].y = offset_vertex2[1];
+ polygon[1].x = offset_vertex2[2];
+ polygon[1].y = offset_vertex2[3];
+ polygon[2].x = offset_vertex2[4];
+ polygon[2].y = offset_vertex2[5];
+ polygon[3].x = offset_vertex2[6];
+ polygon[3].y = offset_vertex2[7];
+
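+ // Crossing-number (ray casting) test: count how many polygon edges a
+ // horizontal ray to the right of the point crosses; an odd count means the
+ // point lies inside the quadrilateral.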
+ int nCross = 0;
+ int i, j;
+ float sx, sy, tx, ty, px, py, x;
+ for (i = 0, j = 3; i < 4; j = i, i++) {
+ sx = polygon[i].x;
+ sy = polygon[i].y;
+ tx = polygon[j].x;
+ ty = polygon[j].y;
+
+ px = point_[0].x;
+ py = point_[0].y;
+
+ if (py < min(sy, ty)) continue;
+ if (py > max(sy, ty)) continue;
+
+ if ((sx == px && sy == py) || (tx == px && ty == py)) {
+ break;
+ } else {
+ if ((sy < py && ty >= py) || (sy >= py && ty < py)) {
+ x = sx + (py - sy) * (tx - sx) / (ty - sy);
+ if (x == px) {
+ break;
+ }
+ if (x > px) {
+ nCross++;
+ }
+ }
+ }
+ }
+ if (nCross % 2 == 1) {
+ inside_flag[index] = 1.0;
+ } else {
+ inside_flag[index] = 0.0;
+ }
+ return;
+ }
+}
+
+#endif // POINTS_IN_POLYGONS_CUDA_KERNEL_CUH
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/csrc/common/cuda/psamask_cuda_kernel.cuh b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/csrc/common/cuda/psamask_cuda_kernel.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..739fb4e623b0e75154b818a6e91c9c2c214e3349
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/csrc/common/cuda/psamask_cuda_kernel.cuh
@@ -0,0 +1,154 @@
+// encoding=utf-8
+// Copyright 2021 Huawei Technologies Co., Ltd
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#ifndef PSAMASK_CUDA_KERNEL_CUH
+#define PSAMASK_CUDA_KERNEL_CUH
+
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else
+#include "pytorch_cuda_helper.hpp"
+#endif
+
+// CUDA: grid stride looping
+#ifndef CUDA_KERNEL_LOOP
+#define CUDA_KERNEL_LOOP(i, n) \
+ for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
+ i += blockDim.x * gridDim.x)
+#endif
+
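+// PSA mask kernels: expand each position's local (h_mask x w_mask) attention
+// window into a dense (H*W) x (H*W) buffer. "collect" indexes the buffer by
+// (pointed-to position, current position); "distribute" (below) swaps the two.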
+template <typename T>
+__global__ void psamask_collect_forward_cuda(
+ const int nthreads, const int h_feature, const int w_feature,
+ const int h_mask, const int w_mask, const int half_h_mask,
+ const int half_w_mask, const T* mask_data, T* buffer_data) {
+ CUDA_KERNEL_LOOP(index, nthreads) {
+ const int w = index % w_feature;
+ const int h = (index / w_feature) % h_feature;
+ const int n = index / w_feature / h_feature;
+ // effective mask region : [hstart, hend) x [wstart, wend) with mask-indexed
+ const int hstart = max(0, half_h_mask - h);
+ const int hend = min(h_mask, h_feature + half_h_mask - h);
+ const int wstart = max(0, half_w_mask - w);
+ const int wend = min(w_mask, w_feature + half_w_mask - w);
+ // (hidx, widx ) with mask-indexed
+ // (hidx + h - half_h_mask, widx + w - half_w_mask) with feature-indexed
+ for (int hidx = hstart; hidx < hend; hidx++) {
+ for (int widx = wstart; widx < wend; widx++) {
+ buffer_data[(n * h_feature * w_feature +
+ (hidx + h - half_h_mask) * w_feature +
+ (widx + w - half_w_mask)) *
+ h_feature * w_feature +
+ h * w_feature + w] = mask_data
+ [((n * h_mask * w_mask + hidx * w_mask + widx) * h_feature + h) *
+ w_feature +
+ w];
+ }
+ }
+ }
+}
+
+template <typename T>
+__global__ void psamask_distribute_forward_cuda(
+ const int nthreads, const int h_feature, const int w_feature,
+ const int h_mask, const int w_mask, const int half_h_mask,
+ const int half_w_mask, const T* mask_data, T* buffer_data) {
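+ // Identical copy to the collect kernel, except that the two (h * w)
+ // position indices of the buffer are transposed.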
+ CUDA_KERNEL_LOOP(index, nthreads) {
+ const int w = index % w_feature;
+ const int h = (index / w_feature) % h_feature;
+ const int n = index / w_feature / h_feature;
+ // effective mask region : [hstart, hend) x [wstart, wend) with mask-indexed
+ const int hstart = max(0, half_h_mask - h);
+ const int hend = min(h_mask, h_feature + half_h_mask - h);
+ const int wstart = max(0, half_w_mask - w);
+ const int wend = min(w_mask, w_feature + half_w_mask - w);
+ // (hidx, widx ) with mask-indexed
+ // (hidx + h - half_h_mask, widx + w - half_w_mask) with feature-indexed
+ for (int hidx = hstart; hidx < hend; hidx++) {
+ for (int widx = wstart; widx < wend; widx++) {
+ buffer_data[(n * h_feature * w_feature + h * w_feature + w) *
+ h_feature * w_feature +
+ (hidx + h - half_h_mask) * w_feature +
+ (widx + w - half_w_mask)] = mask_data
+ [((n * h_mask * w_mask + hidx * w_mask + widx) * h_feature + h) *
+ w_feature +
+ w];
+ }
+ }
+ }
+}
+
+template <typename T>
+__global__ void psamask_collect_backward_cuda(
+ const int nthreads, const int h_feature, const int w_feature,
+ const int h_mask, const int w_mask, const int half_h_mask,
+ const int half_w_mask, const T* buffer_diff, T* mask_diff) {
+ CUDA_KERNEL_LOOP(index, nthreads) {
+ const int w = index % w_feature;
+ const int h = (index / w_feature) % h_feature;
+ const int n = index / w_feature / h_feature;
+ // effective mask region : [hstart, hend) x [wstart, wend) with mask-indexed
+ const int hstart = max(0, half_h_mask - h);
+ const int hend = min(h_mask, h_feature + half_h_mask - h);
+ const int wstart = max(0, half_w_mask - w);
+ const int wend = min(w_mask, w_feature + half_w_mask - w);
+ // (hidx, widx ) with mask-indexed
+ // (hidx + h - half_h_mask, widx + w - half_w_mask) with feature-indexed
+ for (int hidx = hstart; hidx < hend; hidx++) {
+ for (int widx = wstart; widx < wend; widx++) {
+ mask_diff[((n * h_mask * w_mask + hidx * w_mask + widx) * h_feature +
+ h) *
+ w_feature +
+ w] = buffer_diff[(n * h_feature * w_feature +
+ (hidx + h - half_h_mask) * w_feature +
+ (widx + w - half_w_mask)) *
+ h_feature * w_feature +
+ h * w_feature + w];
+ }
+ }
+ }
+}
+
+template <typename T>
+__global__ void psamask_distribute_backward_cuda(
+ const int nthreads, const int h_feature, const int w_feature,
+ const int h_mask, const int w_mask, const int half_h_mask,
+ const int half_w_mask, const T* buffer_diff, T* mask_diff) {
+ CUDA_KERNEL_LOOP(index, nthreads) {
+ const int w = index % w_feature;
+ const int h = (index / w_feature) % h_feature;
+ const int n = index / w_feature / h_feature;
+ // effective mask region : [hstart, hend) x [wstart, wend) with mask-indexed
+ const int hstart = max(0, half_h_mask - h);
+ const int hend = min(h_mask, h_feature + half_h_mask - h);
+ const int wstart = max(0, half_w_mask - w);
+ const int wend = min(w_mask, w_feature + half_w_mask - w);
+ // (hidx, widx ) with mask-indexed
+ // (hidx + h - half_h_mask, widx + w - half_w_mask) with feature-indexed
+ for (int hidx = hstart; hidx < hend; hidx++) {
+ for (int widx = wstart; widx < wend; widx++) {
+ mask_diff[((n * h_mask * w_mask + hidx * w_mask + widx) * h_feature +
+ h) *
+ w_feature +
+ w] =
+ buffer_diff[(n * h_feature * w_feature + h * w_feature + w) *
+ h_feature * w_feature +
+ (hidx + h - half_h_mask) * w_feature +
+ (widx + w - half_w_mask)];
+ }
+ }
+ }
+}
+
+#endif // PSAMASK_CUDA_KERNEL_CUH
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/csrc/common/cuda/riroi_align_rotated_cuda_kernel.cuh b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/csrc/common/cuda/riroi_align_rotated_cuda_kernel.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..4383d9e82cce97362f53cf799b8dfa30c7b4cd02
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/csrc/common/cuda/riroi_align_rotated_cuda_kernel.cuh
@@ -0,0 +1,242 @@
+// Modified from
+// https://github.com/csuhan/ReDet/blob/master/mmdet/ops/riroi_align/src/riroi_align_kernel.cu
+#ifndef RIROI_ALIGN_ROTATED_CUDA_KERNEL_CUH
+#define RIROI_ALIGN_ROTATED_CUDA_KERNEL_CUH
+
+#include <float.h>
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else // MMCV_USE_PARROTS
+#include "pytorch_cuda_helper.hpp"
+#endif // MMCV_USE_PARROTS
+
+/*** Forward ***/
+template <typename scalar_t>
+__global__ void riroi_align_rotated_forward_cuda_kernel(
+ const int nthreads, const scalar_t *bottom_data,
+ const scalar_t *bottom_rois, const scalar_t spatial_scale,
+ const int num_samples, const bool clockwise, const int channels,
+ const int height, const int width, const int pooled_height,
+ const int pooled_width, const int num_orientations, scalar_t *top_data) {
+ CUDA_1D_KERNEL_LOOP(index, nthreads) {
+ // (n, c, ph, pw) is an element in the pooled output
+ int pw = index % pooled_width;
+ int ph = (index / pooled_width) % pooled_height;
+ int o = (index / pooled_width / pooled_height) % num_orientations;
+ int c =
+ (index / pooled_width / pooled_height / num_orientations) % channels;
+ int n = index / pooled_width / pooled_height / num_orientations / channels;
+
+ const scalar_t *offset_bottom_rois = bottom_rois + n * 6;
+ int roi_batch_ind = offset_bottom_rois[0];
+
+ // Do not use rounding; this implementation detail is critical
+ scalar_t roi_center_w = offset_bottom_rois[1] * spatial_scale;
+ scalar_t roi_center_h = offset_bottom_rois[2] * spatial_scale;
+ scalar_t roi_width = offset_bottom_rois[3] * spatial_scale;
+ scalar_t roi_height = offset_bottom_rois[4] * spatial_scale;
+ // scalar_t theta = offset_bottom_rois[5] * M_PI / 180.0;
+ scalar_t theta = offset_bottom_rois[5];
+ // Force malformed ROIs to be 1x1
+ roi_width = max(roi_width, (scalar_t)1.);
+ roi_height = max(roi_height, (scalar_t)1.);
+ scalar_t bin_size_h = static_cast<scalar_t>(roi_height) /
+ static_cast<scalar_t>(pooled_height);
+ scalar_t bin_size_w =
+ static_cast<scalar_t>(roi_width) / static_cast<scalar_t>(pooled_width);
+
+ // find aligned index
+ scalar_t ind_float = theta * num_orientations / (2 * M_PI);
+ int ind = floorf(ind_float);
+ scalar_t l_var = ind_float - (scalar_t)ind;
+ scalar_t r_var = 1.0 - l_var;
+ // correct start channel
+ ind = (ind + num_orientations) % num_orientations;
+ // rotated channel
+ int ind_rot = (o - ind + num_orientations) % num_orientations;
+ int ind_rot_plus = (ind_rot + 1 + num_orientations) % num_orientations;
+ const scalar_t *offset_bottom_data =
+ bottom_data + (roi_batch_ind * channels * num_orientations +
+ c * num_orientations + ind_rot) *
+ height * width;
+
+ const scalar_t *offset_bottom_data_plus =
+ bottom_data + (roi_batch_ind * channels * num_orientations +
+ c * num_orientations + ind_rot_plus) *
+ height * width;
+ // We use roi_bin_grid to sample the grid and mimic integral
+ int roi_bin_grid_h = (num_samples > 0)
+ ? num_samples
+ : ceilf(roi_height / pooled_height); // e.g., = 2
+ int roi_bin_grid_w =
+ (num_samples > 0) ? num_samples : ceilf(roi_width / pooled_width);
+
+ // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y).
+ // Appropriate translation needs to be applied after.
+ if (clockwise) {
+ theta = -theta; // If clockwise, the angle needs to be reversed.
+ }
+ scalar_t roi_start_h = -roi_height / 2.0;
+ scalar_t roi_start_w = -roi_width / 2.0;
+ scalar_t cosscalar_theta = cos(theta);
+ scalar_t sinscalar_theta = sin(theta);
+
+ // We do average (integral) pooling inside a bin
+ const scalar_t count = max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. = 4
+
+ scalar_t output_val = 0.;
+ for (int iy = 0; iy < roi_bin_grid_h; iy++) { // e.g., iy = 0, 1
+ const scalar_t yy =
+ roi_start_h + ph * bin_size_h +
+ static_cast<scalar_t>(iy + .5f) * bin_size_h /
+ static_cast<scalar_t>(roi_bin_grid_h); // e.g., 0.5, 1.5
+ for (int ix = 0; ix < roi_bin_grid_w; ix++) {
+ const scalar_t xx = roi_start_w + pw * bin_size_w +
+ static_cast<scalar_t>(ix + .5f) * bin_size_w /
+ static_cast<scalar_t>(roi_bin_grid_w);
+
+ // Rotate by theta (counterclockwise) around the center and translate
+ scalar_t y = yy * cosscalar_theta - xx * sinscalar_theta + roi_center_h;
+ scalar_t x = yy * sinscalar_theta + xx * cosscalar_theta + roi_center_w;
+
+ scalar_t val = bilinear_interpolate<scalar_t>(
+ offset_bottom_data, height, width, y, x, index);
+ scalar_t val_plus = bilinear_interpolate<scalar_t>(
+ offset_bottom_data_plus, height, width, y, x, index);
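+ // Blend the two neighbouring orientation channels so the response varies
+ // smoothly with the RoI angle (weights r_var and l_var sum to 1).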
+ output_val += r_var * val + l_var * val_plus;
+ }
+ }
+ output_val /= count;
+
+ top_data[index] = output_val;
+ }
+}
+
+/*** Backward ***/
+template <typename scalar_t>
+__global__ void riroi_align_rotated_backward_cuda_kernel(
+ const int nthreads, const scalar_t *top_diff, const scalar_t *bottom_rois,
+ const scalar_t spatial_scale, const int num_samples, const bool clockwise,
+ const int channels, const int height, const int width,
+ const int pooled_height, const int pooled_width, const int num_orientations,
+ scalar_t *bottom_diff) {
+ CUDA_1D_KERNEL_LOOP(index, nthreads) {
+ // (n, c, ph, pw) is an element in the pooled output
+ int pw = index % pooled_width;
+ int ph = (index / pooled_width) % pooled_height;
+ int o = (index / pooled_width / pooled_height) % num_orientations;
+ int c =
+ (index / pooled_width / pooled_height / num_orientations) % channels;
+ int n = index / pooled_width / pooled_height / num_orientations / channels;
+
+ const scalar_t *offset_bottom_rois = bottom_rois + n * 6;
+ int roi_batch_ind = offset_bottom_rois[0];
+
+ // Do not round
+ scalar_t roi_center_w = offset_bottom_rois[1] * spatial_scale;
+ scalar_t roi_center_h = offset_bottom_rois[2] * spatial_scale;
+ scalar_t roi_width = offset_bottom_rois[3] * spatial_scale;
+ scalar_t roi_height = offset_bottom_rois[4] * spatial_scale;
+ // scalar_t theta = offset_bottom_rois[5] * M_PI / 180.0;
+ scalar_t theta = offset_bottom_rois[5];
+ // Force malformed ROIs to be 1x1
+ roi_width = max(roi_width, (scalar_t)1.);
+ roi_height = max(roi_height, (scalar_t)1.);
+
+ scalar_t bin_size_h = static_cast<scalar_t>(roi_height) /
+ static_cast<scalar_t>(pooled_height);
+ scalar_t bin_size_w =
+ static_cast<scalar_t>(roi_width) / static_cast<scalar_t>(pooled_width);
+
+ // find aligned index
+ scalar_t ind_float = theta * num_orientations / (2 * M_PI);
+ int ind = floorf(ind_float);
+ scalar_t l_var = ind_float - (scalar_t)ind;
+ scalar_t r_var = 1.0 - l_var;
+ // correct start channel
+ ind = (ind + num_orientations) % num_orientations;
+ // rotated channel
+ int ind_rot = (o - ind + num_orientations) % num_orientations;
+ int ind_rot_plus = (ind_rot + 1 + num_orientations) % num_orientations;
+ scalar_t *offset_bottom_diff =
+ bottom_diff + (roi_batch_ind * channels * num_orientations +
+ c * num_orientations + ind_rot) *
+ height * width;
+ scalar_t *offset_bottom_diff_plus =
+ bottom_diff + (roi_batch_ind * channels * num_orientations +
+ c * num_orientations + ind_rot_plus) *
+ height * width;
+ int top_offset =
+ (n * channels * num_orientations + c * num_orientations + o) *
+ pooled_height * pooled_width;
+ const scalar_t *offset_top_diff = top_diff + top_offset;
+ const scalar_t top_diff_this_bin = offset_top_diff[ph * pooled_width + pw];
+
+ // We use roi_bin_grid to sample the grid and mimic integral
+ int roi_bin_grid_h = (num_samples > 0)
+ ? num_samples
+ : ceilf(roi_height / pooled_height); // e.g., = 2
+ int roi_bin_grid_w =
+ (num_samples > 0) ? num_samples : ceilf(roi_width / pooled_width);
+
+ // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y).
+ // Appropriate translation needs to be applied after.
+ if (clockwise) {
+ theta = -theta; // If clockwise, the angle needs to be reversed.
+ }
+ scalar_t roi_start_h = -roi_height / 2.0;
+ scalar_t roi_start_w = -roi_width / 2.0;
+ scalar_t cosTheta = cos(theta);
+ scalar_t sinTheta = sin(theta);
+
+ // We do average (integral) pooling inside a bin
+ const scalar_t count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4
+
+ for (int iy = 0; iy < roi_bin_grid_h; iy++) { // e.g., iy = 0, 1
+ const scalar_t yy =
+ roi_start_h + ph * bin_size_h +
+ static_cast<scalar_t>(iy + .5f) * bin_size_h /
+ static_cast<scalar_t>(roi_bin_grid_h); // e.g., 0.5, 1.5
+ for (int ix = 0; ix < roi_bin_grid_w; ix++) {
+ const scalar_t xx = roi_start_w + pw * bin_size_w +
+ static_cast<scalar_t>(ix + .5f) * bin_size_w /
+ static_cast<scalar_t>(roi_bin_grid_w);
+
+ // Rotate by theta around the center and translate
+ scalar_t y = yy * cosTheta - xx * sinTheta + roi_center_h;
+ scalar_t x = yy * sinTheta + xx * cosTheta + roi_center_w;
+
+ scalar_t w1, w2, w3, w4;
+ int x_low, x_high, y_low, y_high;
+
+ bilinear_interpolate_gradient<scalar_t>(height, width, y, x, w1, w2, w3,
+ w4, x_low, x_high, y_low,
+ y_high, index);
+
+ scalar_t g1 = top_diff_this_bin * w1 / count;
+ scalar_t g2 = top_diff_this_bin * w2 / count;
+ scalar_t g3 = top_diff_this_bin * w3 / count;
+ scalar_t g4 = top_diff_this_bin * w4 / count;
+
+ if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
+ atomicAdd(offset_bottom_diff + y_low * width + x_low, g1 * r_var);
+ atomicAdd(offset_bottom_diff + y_low * width + x_high, g2 * r_var);
+ atomicAdd(offset_bottom_diff + y_high * width + x_low, g3 * r_var);
+ atomicAdd(offset_bottom_diff + y_high * width + x_high, g4 * r_var);
+
+ atomicAdd(offset_bottom_diff_plus + y_low * width + x_low,
+ g1 * l_var);
+ atomicAdd(offset_bottom_diff_plus + y_low * width + x_high,
+ g2 * l_var);
+ atomicAdd(offset_bottom_diff_plus + y_high * width + x_low,
+ g3 * l_var);
+ atomicAdd(offset_bottom_diff_plus + y_high * width + x_high,
+ g4 * l_var);
+
+ } // if
+ } // ix
+ } // iy
+ } // CUDA_1D_KERNEL_LOOP
+} // RiRoIAlignBackward
+
+#endif // RIROI_ALIGN_ROTATED_CUDA_KERNEL_CUH
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/csrc/common/cuda/roi_align_cuda_kernel.cuh b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/csrc/common/cuda/roi_align_cuda_kernel.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..53554c0980d3f4f60375c5e2d6ed330c4c2c470c
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/csrc/common/cuda/roi_align_cuda_kernel.cuh
@@ -0,0 +1,225 @@
+// encoding=utf-8
+// Copyright 2021 Huawei Technologies Co., Ltd
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#ifndef ROI_ALIGN_CUDA_KERNEL_CUH
+#define ROI_ALIGN_CUDA_KERNEL_CUH
+
+#include <float.h>
+#ifdef MMCV_WITH_TRT
+#include "common_cuda_helper.hpp"
+#else // MMCV_WITH_TRT
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else // MMCV_USE_PARROTS
+#include "pytorch_cuda_helper.hpp"
+#endif // MMCV_USE_PARROTS
+#endif // MMCV_WITH_TRT
+
+/*** Forward ***/
+template <typename T>
+__global__ void roi_align_forward_cuda_kernel(
+ const int nthreads, const T* input, const T* rois, T* output, T* argmax_y,
+ T* argmax_x, const int pooled_height, const int pooled_width,
+ const T spatial_scale, const int sampling_ratio,
+ const int pool_mode, // 0 - max pool, 1 - avg pool
+ const bool aligned, const int channels, const int height, const int width) {
+ CUDA_1D_KERNEL_LOOP(index, nthreads) {
+ // (n, c, ph, pw) is an element in the pooled output
+ int pw = index % pooled_width;
+ int ph = (index / pooled_width) % pooled_height;
+ int c = (index / pooled_width / pooled_height) % channels;
+ int n = index / pooled_width / pooled_height / channels;
+
+ const T* offset_rois = rois + n * 5;
+ int roi_batch_ind = offset_rois[0];
+
+ // Do not use rounding; this implementation detail is critical
+ T offset = aligned ? (T)0.5 : (T)0.0;
+ T roi_start_w = offset_rois[1] * spatial_scale - offset;
+ T roi_start_h = offset_rois[2] * spatial_scale - offset;
+ T roi_end_w = offset_rois[3] * spatial_scale - offset;
+ T roi_end_h = offset_rois[4] * spatial_scale - offset;
+
+ T roi_width = roi_end_w - roi_start_w;
+ T roi_height = roi_end_h - roi_start_h;
+ if (!aligned) { // for backward-compatibility only
+ roi_width = max(roi_width, (T)1.);
+ roi_height = max(roi_height, (T)1.);
+ }
+
+ T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
+ T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
+
+ const T* offset_input =
+ input + (roi_batch_ind * channels + c) * height * width;
+
+ // We use roi_bin_grid to sample the grid and mimic integral
+ int roi_bin_grid_h =
+ (sampling_ratio > 0)
+ ? sampling_ratio
+ : static_cast<int>(ceilf(roi_height / pooled_height));
+ int roi_bin_grid_w =
+ (sampling_ratio > 0)
+ ? sampling_ratio
+ : static_cast<int>(ceilf(roi_width / pooled_width));
+
+ if (pool_mode == 0) {
+ // We do max pooling inside a bin
+ T maxval = -FLT_MAX;
+ T maxidx_y = -1.f, maxidx_x = -1.f;
+ for (int iy = 0; iy < roi_bin_grid_h; iy++) {
+ const T y = roi_start_h + ph * bin_size_h +
+ static_cast<T>(iy + .5f) * bin_size_h /
+ static_cast<T>(roi_bin_grid_h);
+ for (int ix = 0; ix < roi_bin_grid_w; ix++) {
+ const T x = roi_start_w + pw * bin_size_w +
+ static_cast<T>(ix + .5f) * bin_size_w /
+ static_cast<T>(roi_bin_grid_w);
+ T val =
+ bilinear_interpolate(offset_input, height, width, y, x, index);
+ if (val > maxval) {
+ maxval = val;
+ maxidx_y = y;
+ maxidx_x = x;
+ }
+ }
+ }
+ output[index] = maxval;
+ argmax_y[index] = maxidx_y;
+ argmax_x[index] = maxidx_x;
+ } else if (pool_mode == 1) {
+ // We do average pooling inside a bin
+ const T count = max(roi_bin_grid_h * roi_bin_grid_w, 1);
+ T output_val = 0.;
+ for (int iy = 0; iy < roi_bin_grid_h; iy++) {
+ const T y = roi_start_h + ph * bin_size_h +
+ static_cast<T>(iy + .5f) * bin_size_h /
+ static_cast<T>(roi_bin_grid_h);
+ for (int ix = 0; ix < roi_bin_grid_w; ix++) {
+ const T x = roi_start_w + pw * bin_size_w +
+ static_cast<T>(ix + .5f) * bin_size_w /
+ static_cast<T>(roi_bin_grid_w);
+ T val =
+ bilinear_interpolate(offset_input, height, width, y, x, index);
+ output_val += val;
+ }
+ }
+ output[index] = output_val / count;
+ }
+ }
+}
+
+/*** Backward ***/
+template <typename T>
+__global__ void roi_align_backward_cuda_kernel(
+ const int nthreads, const T* grad_output, const T* rois, const T* argmax_y,
+ const T* argmax_x, T* grad_input, const int pooled_height,
+ const int pooled_width, const T spatial_scale, const int sampling_ratio,
+ const int pool_mode, // 0 - max pool, 1 - avg pool
+ const bool aligned, const int channels, const int height, const int width) {
+ CUDA_1D_KERNEL_LOOP(index, nthreads) {
+ // (n, c, ph, pw) is an element in the pooled output
+ int pw = index % pooled_width;
+ int ph = (index / pooled_width) % pooled_height;
+ int c = (index / pooled_width / pooled_height) % channels;
+ int n = index / pooled_width / pooled_height / channels;
+
+ const T grad_output_this_bin = grad_output[index];
+
+ const T* offset_rois = rois + n * 5;
+ int roi_batch_ind = offset_rois[0];
+ T* offset_grad_input =
+ grad_input + ((roi_batch_ind * channels + c) * height * width);
+
+ if (pool_mode == 0) {
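+ // Max pooling: the gradient flows only through the sampling location that
+ // produced the maximum, recorded as (argmax_y, argmax_x) in the forward pass.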
+ T y = argmax_y[index], x = argmax_x[index];
+ if (y != -1.f) {
+ T w1, w2, w3, w4;
+ int x_low, x_high, y_low, y_high;
+ bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, w4,
+ x_low, x_high, y_low, y_high, index);
+
+ if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
+ atomicAdd(offset_grad_input + y_low * width + x_low,
+ grad_output_this_bin * w1);
+ atomicAdd(offset_grad_input + y_low * width + x_high,
+ grad_output_this_bin * w2);
+ atomicAdd(offset_grad_input + y_high * width + x_low,
+ grad_output_this_bin * w3);
+ atomicAdd(offset_grad_input + y_high * width + x_high,
+ grad_output_this_bin * w4);
+ }
+ }
+ } else if (pool_mode == 1) {
+ // Do not use rounding; this implementation detail is critical
+ T offset = aligned ? (T)0.5 : (T)0.0;
+ T roi_start_w = offset_rois[1] * spatial_scale - offset;
+ T roi_start_h = offset_rois[2] * spatial_scale - offset;
+ T roi_end_w = offset_rois[3] * spatial_scale - offset;
+ T roi_end_h = offset_rois[4] * spatial_scale - offset;
+
+ T roi_width = roi_end_w - roi_start_w;
+ T roi_height = roi_end_h - roi_start_h;
+ if (!aligned) { // for backward-compatibility only
+ roi_width = max(roi_width, (T)1.);
+ roi_height = max(roi_height, (T)1.);
+ }
+
+ T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
+ T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
+
+ // We use roi_bin_grid to sample the grid and mimic integral
+ int roi_bin_grid_h =
+ (sampling_ratio > 0)
+ ? sampling_ratio
+ : static_cast<int>(ceilf(roi_height / pooled_height));
+ int roi_bin_grid_w =
+ (sampling_ratio > 0)
+ ? sampling_ratio
+ : static_cast<int>(ceilf(roi_width / pooled_width));
+
+ // We do average (integral) pooling inside a bin
+ const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4
+
+ for (int iy = 0; iy < roi_bin_grid_h; iy++) {
+ const T y = roi_start_h + ph * bin_size_h +
+ static_cast<T>(iy + .5f) * bin_size_h /
+ static_cast<T>(roi_bin_grid_h);
+ for (int ix = 0; ix < roi_bin_grid_w; ix++) {
+ const T x = roi_start_w + pw * bin_size_w +
+ static_cast<T>(ix + .5f) * bin_size_w /
+ static_cast<T>(roi_bin_grid_w);
+
+ T w1, w2, w3, w4;
+ int x_low, x_high, y_low, y_high;
+ bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, w4,
+ x_low, x_high, y_low, y_high, index);
+
+ if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
+ atomicAdd(offset_grad_input + y_low * width + x_low,
+ grad_output_this_bin * w1 / count);
+ atomicAdd(offset_grad_input + y_low * width + x_high,
+ grad_output_this_bin * w2 / count);
+ atomicAdd(offset_grad_input + y_high * width + x_low,
+ grad_output_this_bin * w3 / count);
+ atomicAdd(offset_grad_input + y_high * width + x_high,
+ grad_output_this_bin * w4 / count);
+ }
+ }
+ }
+ }
+ }
+}
+
+#endif // ROI_ALIGN_CUDA_KERNEL_CUH
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/csrc/common/cuda/roi_align_rotated_cuda_kernel.cuh b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/csrc/common/cuda/roi_align_rotated_cuda_kernel.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..33571f29674f53674415afe1bb4cc3ea0d8a9865
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DPT/mmcv_replace/ops/csrc/common/cuda/roi_align_rotated_cuda_kernel.cuh
@@ -0,0 +1,202 @@
+// Modified from
+// https://github.com/facebookresearch/detectron2/tree/master/detectron2/layers/csrc/ROIAlignRotated
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+#ifndef ROI_ALIGN_ROTATED_CUDA_KERNEL_CUH
+#define ROI_ALIGN_ROTATED_CUDA_KERNEL_CUH
+
+#include <float.h>
+#ifdef MMCV_WITH_TRT
+#include "common_cuda_helper.hpp"
+#else // MMCV_WITH_TRT
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else // MMCV_USE_PARROTS
+#include "pytorch_cuda_helper.hpp"
+#endif // MMCV_USE_PARROTS
+#endif // MMCV_WITH_TRT
+
+/*** Forward ***/
+template <typename scalar_t>